Refa: improve flow of GraphRAG and RAPTOR (#10709)

### What problem does this PR solve?

Improve flow of GraphRAG and RAPTOR.

### Type of change

- [x] Refactoring
2026-01-31 15:45:08 +08:00 · 2025-10-22 09:29:20 +08:00
parent acc0f7396e
commit 2d491188b8
3 changed files with 22 additions and 17 deletions
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -671,9 +671,11 @@ class DocumentService(CommonService):
    @classmethod
    @DB.connection_context()
    def _sync_progress(cls, docs:list[dict]):
+        from api.db.services.task_service import TaskService
+
        for d in docs:
            try:
-                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
+                tsks = TaskService.query(doc_id=d["id"], order_by=Task.create_time)
                if not tsks:
                    continue
                msg = []
@ -791,21 +793,23 @@ class DocumentService(CommonService):
            "cancelled": int(cancelled),
        }

-def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[]):
+def queue_raptor_o_graphrag_tasks(sample_doc_id, ty, priority, fake_doc_id="", doc_ids=[]):
    """
    You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
    Optionally, specify a list of doc_ids to determine which documents participate in the task.
    """
-    chunking_config = DocumentService.get_chunking_config(doc["id"])
+    assert ty in ["graphrag", "raptor", "mindmap"], "type should be graphrag, raptor or mindmap"
+
+    chunking_config = DocumentService.get_chunking_config(sample_doc_id["id"])
    hasher = xxhash.xxh64()
    for field in sorted(chunking_config.keys()):
        hasher.update(str(chunking_config[field]).encode("utf-8"))

    def new_task():
-        nonlocal doc
+        nonlocal sample_doc_id
        return {
            "id": get_uuid(),
-            "doc_id": fake_doc_id if fake_doc_id else doc["id"],
+            "doc_id": sample_doc_id["id"],
            "from_page": 100000000,
            "to_page": 100000000,
            "task_type": ty,
@ -820,9 +824,9 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[])
    task["digest"] = hasher.hexdigest()
    bulk_insert_into_db(Task, [task], True)

-    if ty in ["graphrag", "raptor", "mindmap"]:
-        task["doc_ids"] = doc_ids
-        DocumentService.begin2parse(doc["id"])
+    task["doc_id"] = fake_doc_id
+    task["doc_ids"] = doc_ids
+    DocumentService.begin2parse(sample_doc_id["id"])
    assert REDIS_CONN.queue_product(get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
    return task["id"]