Feat: add foundational support for GraphRAG dataset pipeline logs (#10264)

### What problem does this PR solve?

Add foundational support for GraphRAG dataset pipeline logs

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-25 09:35:50 +08:00
committed by GitHub
parent a6039cf563
commit 840b2b5809
10 changed files with 469 additions and 36 deletions

View File

@ -507,6 +507,9 @@ class DocumentService(CommonService):
@classmethod
@DB.connection_context()
def get_doc_id_by_doc_name(cls, doc_name):
"""
highly rely on the strict deduplication guarantee from Document
"""
fields = [cls.model.id]
doc_id = cls.model.select(*fields) \
.where(cls.model.name == doc_name)
@ -656,6 +659,7 @@ class DocumentService(CommonService):
queue_raptor_o_graphrag_tasks(d, "graphrag", priority)
prg = 0.98 * len(tsks) / (len(tsks) + 1)
else:
prg = 1
status = TaskStatus.DONE.value
msg = "\n".join(sorted(msg))
@ -741,7 +745,11 @@ class DocumentService(CommonService):
"cancelled": int(cancelled),
}
def queue_raptor_o_graphrag_tasks(doc, ty, priority):
def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[]):
"""
You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
Optionally, specify a list of doc_ids to determine which documents participate in the task.
"""
chunking_config = DocumentService.get_chunking_config(doc["id"])
hasher = xxhash.xxh64()
for field in sorted(chunking_config.keys()):
@ -751,7 +759,7 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority):
nonlocal doc
return {
"id": get_uuid(),
"doc_id": doc["id"],
"doc_id": fake_doc_id if fake_doc_id else doc["id"],
"from_page": 100000000,
"to_page": 100000000,
"task_type": ty,
@ -764,7 +772,11 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority):
hasher.update(ty.encode("utf-8"))
task["digest"] = hasher.hexdigest()
bulk_insert_into_db(Task, [task], True)
if ty == "graphrag":
task["doc_ids"] = doc_ids
assert REDIS_CONN.queue_product(get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
return task["id"]
def get_queue_length(priority):