Fix: GraphRAG and RAPTOR tasks do not affect document status (#11194)

### What problem does this PR solve?

GraphRAG and RAPTOR tasks do not affect document status.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-11-12 12:03:41 +08:00
committed by GitHub
parent bacc9d3ab9
commit 8ae562504b
3 changed files with 32 additions and 14 deletions

View File

@ -70,4 +70,7 @@ class PipelineTaskType(StrEnum):
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP} VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES = {PipelineTaskType.RAPTOR.lower(), PipelineTaskType.GRAPH_RAG.lower(), PipelineTaskType.MINDMAP.lower()}
KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase" KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"

View File

@ -27,7 +27,7 @@ import xxhash
from peewee import fn, Case, JOIN from peewee import fn, Case, JOIN
from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
from api.db import FileType, UserTenantRole, CanvasCategory from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES, FileType, UserTenantRole, CanvasCategory
from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, \ from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, \
User User
from api.db.db_utils import bulk_insert_into_db from api.db.db_utils import bulk_insert_into_db
@ -372,12 +372,16 @@ class DocumentService(CommonService):
def get_unfinished_docs(cls): def get_unfinished_docs(cls):
fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg, fields = [cls.model.id, cls.model.process_begin_at, cls.model.parser_config, cls.model.progress_msg,
cls.model.run, cls.model.parser_id] cls.model.run, cls.model.parser_id]
unfinished_task_query = Task.select(Task.doc_id).where(
(Task.progress >= 0) & (Task.progress < 1)
)
docs = cls.model.select(*fields) \ docs = cls.model.select(*fields) \
.where( .where(
cls.model.status == StatusEnum.VALID.value, cls.model.status == StatusEnum.VALID.value,
~(cls.model.type == FileType.VIRTUAL.value), ~(cls.model.type == FileType.VIRTUAL.value),
cls.model.progress < 1, (((cls.model.progress < 1) & (cls.model.progress > 0)) |
cls.model.progress > 0) (cls.model.id.in_(unfinished_task_query)))) # including unfinished tasks like GraphRAG, RAPTOR and Mindmap
return list(docs.dicts()) return list(docs.dicts())
@classmethod @classmethod
@ -619,13 +623,17 @@ class DocumentService(CommonService):
@classmethod @classmethod
@DB.connection_context() @DB.connection_context()
def begin2parse(cls, docid): def begin2parse(cls, doc_id, keep_progress=False):
cls.update_by_id( info = {
docid, {"progress": random.random() * 1 / 100., "progress_msg": "Task is queued...",
"progress_msg": "Task is queued...", "process_begin_at": get_format_time(),
"process_begin_at": get_format_time(), }
"run": TaskStatus.RUNNING.value if not keep_progress:
}) info["progress"] = random.random() * 1 / 100.
info["run"] = TaskStatus.RUNNING.value
# keep the doc in DONE state when keep_progress=True for GraphRAG, RAPTOR and Mindmap tasks
cls.update_by_id(doc_id, info)
@classmethod @classmethod
@DB.connection_context() @DB.connection_context()
@ -684,8 +692,13 @@ class DocumentService(CommonService):
bad = 0 bad = 0
e, doc = DocumentService.get_by_id(d["id"]) e, doc = DocumentService.get_by_id(d["id"])
status = doc.run # TaskStatus.RUNNING.value status = doc.run # TaskStatus.RUNNING.value
doc_progress = doc.progress if doc and doc.progress else 0.0
special_task_running = False
priority = 0 priority = 0
for t in tsks: for t in tsks:
task_type = (t.task_type or "").lower()
if task_type in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
special_task_running = True
if 0 <= t.progress < 1: if 0 <= t.progress < 1:
finished = False finished = False
if t.progress == -1: if t.progress == -1:
@ -702,13 +715,15 @@ class DocumentService(CommonService):
prg = 1 prg = 1
status = TaskStatus.DONE.value status = TaskStatus.DONE.value
# only for special task and parsed docs and unfinised
freeze_progress = special_task_running and doc_progress >= 1 and not finished
msg = "\n".join(sorted(msg)) msg = "\n".join(sorted(msg))
info = { info = {
"process_duration": datetime.timestamp( "process_duration": datetime.timestamp(
datetime.now()) - datetime.now()) -
d["process_begin_at"].timestamp(), d["process_begin_at"].timestamp(),
"run": status} "run": status}
if prg != 0: if prg != 0 and not freeze_progress:
info["progress"] = prg info["progress"] = prg
if msg: if msg:
info["progress_msg"] = msg info["progress_msg"] = msg
@ -858,7 +873,7 @@ def queue_raptor_o_graphrag_tasks(sample_doc_id, ty, priority, fake_doc_id="", d
task["doc_id"] = fake_doc_id task["doc_id"] = fake_doc_id
task["doc_ids"] = doc_ids task["doc_ids"] = doc_ids
DocumentService.begin2parse(sample_doc_id["id"]) DocumentService.begin2parse(sample_doc_id["id"], keep_progress=True)
assert REDIS_CONN.queue_product(settings.get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status." assert REDIS_CONN.queue_product(settings.get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
return task["id"] return task["id"]
@ -1012,4 +1027,3 @@ def doc_upload_and_parse(conversation_id, file_objs, user_id):
doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0) doc_id, kb.id, token_counts[doc_id], chunk_counts[doc_id], 0)
return [d["id"] for d, _ in files] return [d["id"] for d, _ in files]

View File

@ -24,6 +24,7 @@ import time
import json_repair import json_repair
from api.db import PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES
from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.pipeline_operation_log_service import PipelineOperationLogService from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
from common.connection_utils import timeout from common.connection_utils import timeout
@ -192,7 +193,7 @@ async def collect():
canceled = False canceled = False
if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]: if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
task = msg task = msg
if task["task_type"] in ["graphrag", "raptor", "mindmap"]: if task["task_type"] in PIPELINE_SPECIAL_PROGRESS_FREEZE_TASK_TYPES:
task = TaskService.get_task(msg["id"], msg["doc_ids"]) task = TaskService.get_task(msg["id"], msg["doc_ids"])
if task: if task:
task["doc_id"] = msg["doc_id"] task["doc_id"] = msg["doc_id"]