Fix: GraphRAG and RAPTOR tasks do not affect document status (#11194)

### What problem does this PR solve? GraphRAG and RAPTOR tasks do not affect document status. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-06 18:45:08 +08:00 · 2025-11-12 12:03:41 +08:00
parent bacc9d3ab9
commit 8ae562504b
3 changed files with 32 additions and 14 deletions
--- a/api/db/init.py
+++ b/api/db/init.py
@ -70,4 +70,7 @@ class PipelineTaskType(StrEnum):
 VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}


+PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES = {PipelineTaskType.RAPTOR.lower(), PipelineTaskType.GRAPH_RAG.lower(), PipelineTaskType.MINDMAP.lower()}
+
+
 KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -27,7 +27,7 @@ import xxhash
 from peewee import fn, Case, JOIN

 from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
-from api.db import FileType, UserTenantRole, CanvasCategory
+from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES, FileType, UserTenantRole, CanvasCategory
 from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, \
    User
 from api.db.db_utils import bulk_insert_into_db
@ -372,12 +372,16 @@ class DocumentService(CommonService):
    def get_unfinished_docs(cls):
        fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg,
                  cls.model.run, cls.model.parser_id]
+        unfinished_task_query = Task.select(Task.doc_id).where(
+            (Task.progress >= 0) & (Task.progress < 1)
+        )
+
        docs = cls.model.select(*fields) \
            .where(
            cls.model.status == StatusEnum.VALID.value,
            ~(cls.model.type == FileType.VIRTUAL.value),
-            cls.model.progress < 1,
-            cls.model.progress > 0)
+            (((cls.model.progress < 1) & (cls.model.progress > 0)) |
+             (cls.model.id.in_(unfinished_task_query)))) # including unfinished tasks like GraphRAG, RAPTOR and Mindmap
        return list(docs.dicts())

    @classmethod
@ -619,13 +623,17 @@ class DocumentService(CommonService):

    @classmethod
    @DB.connection_context()
-    def begin2parse(cls, docid):
-        cls.update_by_id(
-            docid, {"progress": random.random() * 1 / 100.,
-                    "progress_msg": "Task is queued...",
-                    "process_begin_at": get_format_time(),
-                    "run": TaskStatus.RUNNING.value
-                    })
+    def begin2parse(cls, doc_id, keep_progress=False):
+        info = {
+            "progress_msg": "Task is queued...",
+            "process_begin_at": get_format_time(),
+        }
+        if not keep_progress:
+            info["progress"] = random.random() * 1 / 100.
+            info["run"] = TaskStatus.RUNNING.value
+            # keep the doc in DONE state when keep_progress=True for GraphRAG, RAPTOR and Mindmap tasks
+
+        cls.update_by_id(doc_id, info)

    @classmethod
    @DB.connection_context()
@ -684,8 +692,13 @@ class DocumentService(CommonService):
                bad = 0
                e, doc = DocumentService.get_by_id(d["id"])
                status = doc.run  # TaskStatus.RUNNING.value
+                doc_progress = doc.progress if doc and doc.progress else 0.0
+                special_task_running = False
                priority = 0
                for t in tsks:
+                    task_type = (t.task_type or "").lower()
+                    if task_type in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
+                        special_task_running = True
                    if 0 <= t.progress < 1:
                        finished = False
                    if t.progress == -1:
@ -702,13 +715,15 @@ class DocumentService(CommonService):
                    prg = 1
                    status = TaskStatus.DONE.value

+                # only for special task and parsed docs and unfinised
+                freeze_progress = special_task_running and doc_progress >= 1 and not finished
                msg = "\n".join(sorted(msg))
                info = {
                    "process_duration": datetime.timestamp(
                        datetime.now()) -
                                       d["process_begin_at"].timestamp(),
                    "run": status}
-                if prg != 0:
+                if prg != 0 and not freeze_progress:
                    info["progress"] = prg
                if msg:
                    info["progress_msg"] = msg
@ -858,7 +873,7 @@ def queue_raptor_o_graphrag_tasks(sample_doc_id, ty, priority, fake_doc_id="", d

    task["doc_id"] = fake_doc_id
    task["doc_ids"] = doc_ids
-    DocumentService.begin2parse(sample_doc_id["id"])
+    DocumentService.begin2parse(sample_doc_id["id"], keep_progress=True)
    assert REDIS_CONN.queue_product(settings.get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
    return task["id"]

@ -1012,4 +1027,3 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
            doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)

    return [d["id"] for d, _ in files]
-
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -24,6 +24,7 @@ import time

 import json_repair

+from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from common.connection_utils import timeout
@ -192,7 +193,7 @@ async def collect():
    canceled = False
    if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
        task = msg
-        if task["task_type"] in ["graphrag", "raptor", "mindmap"]:
+        if task["task_type"] in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
            task = TaskService.get_task(msg["id"], msg["doc_ids"])
            if task:
                task["doc_id"] = msg["doc_id"]