Feat: refine dataflow and initialize dataflow app (#9952)

### What problem does this PR solve?

Refine dataflow and initialize dataflow app.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-05 18:50:46 +08:00
committed by GitHub
parent 9aa8cfb73a
commit 45f52e85d7
21 changed files with 959 additions and 256 deletions

View File

@ -54,15 +54,15 @@ def trim_header_by_lines(text: str, max_length) -> str:
class TaskService(CommonService):
"""Service class for managing document processing tasks.
This class extends CommonService to provide specialized functionality for document
processing task management, including task creation, progress tracking, and chunk
management. It handles various document types (PDF, Excel, etc.) and manages their
processing lifecycle.
The class implements a robust task queue system with retry mechanisms and progress
tracking, supporting both synchronous and asynchronous task execution.
Attributes:
model: The Task model class for database operations.
"""
@ -72,14 +72,14 @@ class TaskService(CommonService):
@DB.connection_context()
def get_task(cls, task_id):
"""Retrieve detailed task information by task ID.
This method fetches comprehensive task details including associated document,
knowledge base, and tenant information. It also handles task retry logic and
progress updates.
Args:
task_id (str): The unique identifier of the task to retrieve.
Returns:
dict: Task details dictionary containing all task information and related metadata.
Returns None if task is not found or has exceeded retry limit.
@ -139,13 +139,13 @@ class TaskService(CommonService):
@DB.connection_context()
def get_tasks(cls, doc_id: str):
"""Retrieve all tasks associated with a document.
This method fetches all processing tasks for a given document, ordered by page
number and creation time. It includes task progress and chunk information.
Args:
doc_id (str): The unique identifier of the document.
Returns:
list[dict]: List of task dictionaries containing task details.
Returns None if no tasks are found.
@ -170,10 +170,10 @@ class TaskService(CommonService):
@DB.connection_context()
def update_chunk_ids(cls, id: str, chunk_ids: str):
"""Update the chunk IDs associated with a task.
This method updates the chunk_ids field of a task, which stores the IDs of
processed document chunks in a space-separated string format.
Args:
id (str): The unique identifier of the task.
chunk_ids (str): Space-separated string of chunk identifiers.
@ -184,11 +184,11 @@ class TaskService(CommonService):
@DB.connection_context()
def get_ongoing_doc_name(cls):
"""Get names of documents that are currently being processed.
This method retrieves information about documents that are in the processing state,
including their locations and associated IDs. It uses database locking to ensure
thread safety when accessing the task information.
Returns:
list[tuple]: A list of tuples, each containing (parent_id/kb_id, location)
for documents currently being processed. Returns empty list if
@ -238,14 +238,14 @@ class TaskService(CommonService):
@DB.connection_context()
def do_cancel(cls, id):
"""Check if a task should be cancelled based on its document status.
This method determines whether a task should be cancelled by checking the
associated document's run status and progress. A task should be cancelled
if its document is marked for cancellation or has negative progress.
Args:
id (str): The unique identifier of the task to check.
Returns:
bool: True if the task should be cancelled, False otherwise.
"""
@ -311,18 +311,18 @@ class TaskService(CommonService):
def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
"""Create and queue document processing tasks.
This function creates processing tasks for a document based on its type and configuration.
It handles different document types (PDF, Excel, etc.) differently and manages task
chunking and configuration. It also implements task reuse optimization by checking
for previously completed tasks.
Args:
doc (dict): Document dictionary containing metadata and configuration.
bucket (str): Storage bucket name where the document is stored.
name (str): File name of the document.
priority (int, optional): Priority level for task queueing (default is 0).
Note:
- For PDF documents, tasks are created per page range based on configuration
- For Excel documents, tasks are created per row range
@ -410,19 +410,19 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
"""Attempt to reuse chunks from previous tasks for optimization.
This function checks if chunks from previously completed tasks can be reused for
the current task, which can significantly improve processing efficiency. It matches
tasks based on page ranges and configuration digests.
Args:
task (dict): Current task dictionary to potentially reuse chunks for.
prev_tasks (list[dict]): List of previous task dictionaries to check for reuse.
chunking_config (dict): Configuration dictionary for chunk processing.
Returns:
int: Number of chunks successfully reused. Returns 0 if no chunks could be reused.
Note:
Chunks can only be reused if:
- A previous task exists with matching page range and configuration digest
@ -470,3 +470,39 @@ def has_canceled(task_id):
except Exception as e:
logging.exception(e)
return False
def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
"""
Returns a tuple (success: bool, error_message: str).
"""
_ = callback
task = dict(
id=get_uuid() if not task_id else task_id,
doc_id=doc_id,
from_page=0,
to_page=100000000,
task_type="dataflow",
priority=priority,
)
TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
bulk_insert_into_db(model=Task, data_source=[task], replace_on_conflict=True)
kb_id = DocumentService.get_knowledgebase_id(doc_id)
if not kb_id:
return False, f"Can't find KB of this document: {doc_id}"
task["kb_id"] = kb_id
task["tenant_id"] = tenant_id
task["task_type"] = "dataflow"
task["dsl"] = dsl
task["dataflow_id"] = get_uuid() if not flow_id else flow_id
if not REDIS_CONN.queue_product(
get_svr_queue_name(priority), message=task
):
return False, "Can't access Redis. Please check the Redis' status."
return True, ""