Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2026-01-31 07:36:46 +08:00 · 2025-10-09 12:36:19 +08:00
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@ -35,6 +35,8 @@ from rag.utils.redis_conn import REDIS_CONN
 from api import settings
 from rag.nlp import search

+CANVAS_DEBUG_DOC_ID = "dataflow_x"
+GRAPH_RAPTOR_FAKE_DOC_ID = "graph_raptor_x"

 def trim_header_by_lines(text: str, max_length) -> str:
    # Trim header text to maximum length while preserving line breaks
@ -70,7 +72,7 @@ class TaskService(CommonService):

    @classmethod
    @DB.connection_context()
-    def get_task(cls, task_id):
+    def get_task(cls, task_id, doc_ids=[]):
        """Retrieve detailed task information by task ID.

        This method fetches comprehensive task details including associated document,
@ -84,6 +86,10 @@ class TaskService(CommonService):
            dict: Task details dictionary containing all task information and related metadata.
                 Returns None if task is not found or has exceeded retry limit.
        """
+        doc_id = cls.model.doc_id
+        if doc_id == CANVAS_DEBUG_DOC_ID and doc_ids:
+            doc_id = doc_ids[0]
+
        fields = [
            cls.model.id,
            cls.model.doc_id,
@ -109,7 +115,7 @@ class TaskService(CommonService):
        ]
        docs = (
            cls.model.select(*fields)
-                .join(Document, on=(cls.model.doc_id == Document.id))
+                .join(Document, on=(doc_id == Document.id))
                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
                .where(cls.model.id == task_id)
@ -292,21 +298,23 @@ class TaskService(CommonService):
                        ((prog == -1) | (prog > cls.model.progress))
                    )
                ).execute()
-            return
+        else:
+            with DB.lock("update_progress", -1):
+                if info["progress_msg"]:
+                    progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
+                    cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
+                if "progress" in info:
+                    prog = info["progress"]
+                    cls.model.update(progress=prog).where(
+                        (cls.model.id == id) &
+                        (
+                            (cls.model.progress != -1) &
+                            ((prog == -1) | (prog > cls.model.progress))
+                        )
+                    ).execute()

-        with DB.lock("update_progress", -1):
-            if info["progress_msg"]:
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
-                cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
-            if "progress" in info:
-                prog = info["progress"]
-                cls.model.update(progress=prog).where(
-                    (cls.model.id == id) &
-                    (
-                        (cls.model.progress != -1) &
-                        ((prog == -1) | (prog > cls.model.progress))
-                    )
-                ).execute()
+        process_duration = (datetime.now() - task.begin_at).total_seconds()
+        cls.model.update(process_duration=process_duration).where(cls.model.id == id).execute()

    @classmethod
    @DB.connection_context()
@ -336,7 +344,14 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        - Previous task chunks may be reused if available
    """
    def new_task():
-        return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0, "from_page": 0, "to_page": 100000000}
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"],
+            "progress": 0.0,
+            "from_page": 0,
+            "to_page": 100000000,
+            "begin_at": datetime.now(),
+        }

    parse_task_array = []

@ -349,7 +364,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        page_size = doc["parser_config"].get("task_page_size") or 12
        if doc["parser_id"] == "paper":
            page_size = doc["parser_config"].get("task_page_size") or 22
-        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC":
+        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
            page_size = 10 ** 9
        page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
        for s, e in page_ranges:
@ -478,33 +493,26 @@ def has_canceled(task_id):
    return False


-def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
-    """
-    Returns a tuple (success: bool, error_message: str).
-    """
-    _ = callback
+def queue_dataflow(tenant_id:str, flow_id:str, task_id:str, doc_id:str=CANVAS_DEBUG_DOC_ID, file:dict=None, priority: int=0, rerun:bool=False) -> tuple[bool, str]:

    task = dict(
-    id=get_uuid() if not task_id else task_id,
-    doc_id=doc_id,
-    from_page=0,
-    to_page=100000000,
-    task_type="dataflow",
-    priority=priority,
+        id=task_id,
+        doc_id=doc_id,
+        from_page=0,
+        to_page=100000000,
+        task_type="dataflow" if not rerun else "dataflow_rerun",
+        priority=priority,
+        begin_at=datetime.now(),
    )
-
-    TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
+    if doc_id not in [CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID]:
+        TaskService.model.delete().where(TaskService.model.doc_id == doc_id).execute()
+        DocumentService.begin2parse(doc_id)
    bulk_insert_into_db(model=Task, data_source=[task], replace_on_conflict=True)

-    kb_id = DocumentService.get_knowledgebase_id(doc_id)
-    if not kb_id:
-        return False, f"Can't find KB of this document: {doc_id}"
-
-    task["kb_id"] = kb_id
+    task["kb_id"] = DocumentService.get_knowledgebase_id(doc_id)
    task["tenant_id"] = tenant_id
-    task["task_type"] = "dataflow"
-    task["dsl"] = dsl
-    task["dataflow_id"] = get_uuid() if not flow_id else flow_id
+    task["dataflow_id"] = flow_id
+    task["file"] = file

    if not REDIS_CONN.queue_product(
        get_svr_queue_name(priority), message=task