mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: refine dataflow and initialize dataflow app (#9952)
### What problem does this PR solve? Refine dataflow and initialize dataflow app. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -54,15 +54,15 @@ def trim_header_by_lines(text: str, max_length) -> str:
|
||||
|
||||
class TaskService(CommonService):
|
||||
"""Service class for managing document processing tasks.
|
||||
|
||||
|
||||
This class extends CommonService to provide specialized functionality for document
|
||||
processing task management, including task creation, progress tracking, and chunk
|
||||
management. It handles various document types (PDF, Excel, etc.) and manages their
|
||||
processing lifecycle.
|
||||
|
||||
|
||||
The class implements a robust task queue system with retry mechanisms and progress
|
||||
tracking, supporting both synchronous and asynchronous task execution.
|
||||
|
||||
|
||||
Attributes:
|
||||
model: The Task model class for database operations.
|
||||
"""
|
||||
@ -72,14 +72,14 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def get_task(cls, task_id):
|
||||
"""Retrieve detailed task information by task ID.
|
||||
|
||||
|
||||
This method fetches comprehensive task details including associated document,
|
||||
knowledge base, and tenant information. It also handles task retry logic and
|
||||
progress updates.
|
||||
|
||||
|
||||
Args:
|
||||
task_id (str): The unique identifier of the task to retrieve.
|
||||
|
||||
|
||||
Returns:
|
||||
dict: Task details dictionary containing all task information and related metadata.
|
||||
Returns None if task is not found or has exceeded retry limit.
|
||||
@ -139,13 +139,13 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def get_tasks(cls, doc_id: str):
|
||||
"""Retrieve all tasks associated with a document.
|
||||
|
||||
|
||||
This method fetches all processing tasks for a given document, ordered by page
|
||||
number and creation time. It includes task progress and chunk information.
|
||||
|
||||
|
||||
Args:
|
||||
doc_id (str): The unique identifier of the document.
|
||||
|
||||
|
||||
Returns:
|
||||
list[dict]: List of task dictionaries containing task details.
|
||||
Returns None if no tasks are found.
|
||||
@ -170,10 +170,10 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def update_chunk_ids(cls, id: str, chunk_ids: str):
|
||||
"""Update the chunk IDs associated with a task.
|
||||
|
||||
|
||||
This method updates the chunk_ids field of a task, which stores the IDs of
|
||||
processed document chunks in a space-separated string format.
|
||||
|
||||
|
||||
Args:
|
||||
id (str): The unique identifier of the task.
|
||||
chunk_ids (str): Space-separated string of chunk identifiers.
|
||||
@ -184,11 +184,11 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def get_ongoing_doc_name(cls):
|
||||
"""Get names of documents that are currently being processed.
|
||||
|
||||
|
||||
This method retrieves information about documents that are in the processing state,
|
||||
including their locations and associated IDs. It uses database locking to ensure
|
||||
thread safety when accessing the task information.
|
||||
|
||||
|
||||
Returns:
|
||||
list[tuple]: A list of tuples, each containing (parent_id/kb_id, location)
|
||||
for documents currently being processed. Returns empty list if
|
||||
@ -238,14 +238,14 @@ class TaskService(CommonService):
|
||||
@DB.connection_context()
|
||||
def do_cancel(cls, id):
|
||||
"""Check if a task should be cancelled based on its document status.
|
||||
|
||||
|
||||
This method determines whether a task should be cancelled by checking the
|
||||
associated document's run status and progress. A task should be cancelled
|
||||
if its document is marked for cancellation or has negative progress.
|
||||
|
||||
|
||||
Args:
|
||||
id (str): The unique identifier of the task to check.
|
||||
|
||||
|
||||
Returns:
|
||||
bool: True if the task should be cancelled, False otherwise.
|
||||
"""
|
||||
@ -311,18 +311,18 @@ class TaskService(CommonService):
|
||||
|
||||
def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
|
||||
"""Create and queue document processing tasks.
|
||||
|
||||
|
||||
This function creates processing tasks for a document based on its type and configuration.
|
||||
It handles different document types (PDF, Excel, etc.) differently and manages task
|
||||
chunking and configuration. It also implements task reuse optimization by checking
|
||||
for previously completed tasks.
|
||||
|
||||
|
||||
Args:
|
||||
doc (dict): Document dictionary containing metadata and configuration.
|
||||
bucket (str): Storage bucket name where the document is stored.
|
||||
name (str): File name of the document.
|
||||
priority (int, optional): Priority level for task queueing (default is 0).
|
||||
|
||||
|
||||
Note:
|
||||
- For PDF documents, tasks are created per page range based on configuration
|
||||
- For Excel documents, tasks are created per row range
|
||||
@ -410,19 +410,19 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
|
||||
|
||||
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
|
||||
"""Attempt to reuse chunks from previous tasks for optimization.
|
||||
|
||||
|
||||
This function checks if chunks from previously completed tasks can be reused for
|
||||
the current task, which can significantly improve processing efficiency. It matches
|
||||
tasks based on page ranges and configuration digests.
|
||||
|
||||
|
||||
Args:
|
||||
task (dict): Current task dictionary to potentially reuse chunks for.
|
||||
prev_tasks (list[dict]): List of previous task dictionaries to check for reuse.
|
||||
chunking_config (dict): Configuration dictionary for chunk processing.
|
||||
|
||||
|
||||
Returns:
|
||||
int: Number of chunks successfully reused. Returns 0 if no chunks could be reused.
|
||||
|
||||
|
||||
Note:
|
||||
Chunks can only be reused if:
|
||||
- A previous task exists with matching page range and configuration digest
|
||||
@ -470,3 +470,39 @@ def has_canceled(task_id):
|
||||
except Exception as e:
|
||||
logging.exception(e)
|
||||
return False
|
||||
|
||||
|
||||
def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
|
||||
"""
|
||||
Returns a tuple (success: bool, error_message: str).
|
||||
"""
|
||||
_ = callback
|
||||
|
||||
task = dict(
|
||||
id=get_uuid() if not task_id else task_id,
|
||||
doc_id=doc_id,
|
||||
from_page=0,
|
||||
to_page=100000000,
|
||||
task_type="dataflow",
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
|
||||
bulk_insert_into_db(model=Task, data_source=[task], replace_on_conflict=True)
|
||||
|
||||
kb_id = DocumentService.get_knowledgebase_id(doc_id)
|
||||
if not kb_id:
|
||||
return False, f"Can't find KB of this document: {doc_id}"
|
||||
|
||||
task["kb_id"] = kb_id
|
||||
task["tenant_id"] = tenant_id
|
||||
task["task_type"] = "dataflow"
|
||||
task["dsl"] = dsl
|
||||
task["dataflow_id"] = get_uuid() if not flow_id else flow_id
|
||||
|
||||
if not REDIS_CONN.queue_product(
|
||||
get_svr_queue_name(priority), message=task
|
||||
):
|
||||
return False, "Can't access Redis. Please check the Redis' status."
|
||||
|
||||
return True, ""
|
||||
|
||||
Reference in New Issue
Block a user