Feat: add foundational support for GraphRAG dataset pipeline logs (#10264)

### What problem does this PR solve?

Add foundational support for GraphRAG dataset pipeline logs.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-03 09:05:07 +08:00 · 2025-09-25 09:35:50 +08:00
parent a6039cf563
commit 840b2b5809
10 changed files with 469 additions and 36 deletions
--- a/api/db/services/pipeline_operation_log_service.py
+++ b/api/db/services/pipeline_operation_log_service.py
@ -31,7 +31,7 @@ class PipelineOperationLogService(CommonService):
    model = PipelineOperationLog

    @classmethod
-    def get_cls_model_fields(cls):
+    def get_file_logs_fields(cls):
        return [
            cls.model.id,
            cls.model.document_id,
@ -59,9 +59,29 @@ class PipelineOperationLogService(CommonService):
            cls.model.update_date,
        ]

+    @classmethod
+    def get_dataset_logs_fields(cls):
+        """Return the model columns selected for dataset-level log rows.
+
+        Returns:
+            list: Field attributes of ``cls.model`` in a fixed order
+            (identity, progress, state, timestamps).
+        """
+        log = cls.model
+        identity = [log.id, log.tenant_id, log.kb_id]
+        progress = [log.progress, log.progress_msg, log.process_begin_at, log.process_duration]
+        state = [log.task_type, log.operation_status, log.avatar, log.status]
+        timestamps = [log.create_time, log.create_date, log.update_time, log.update_date]
+        return identity + progress + state + timestamps
+
    @classmethod
    @DB.connection_context()
-    def create(cls, document_id, pipeline_id, task_type):
+    def create(cls, document_id, pipeline_id, task_type, fake_document_ids=[]):
        from rag.flow.pipeline import Pipeline

        tenant_id = ""
@ -69,14 +89,19 @@ class PipelineOperationLogService(CommonService):
        avatar = ""
        dsl = ""
        operation_status = ""
+        referred_document_id = document_id

-        ok, document = DocumentService.get_by_id(document_id)
+        if referred_document_id == "x" and fake_document_ids:
+            referred_document_id = fake_document_ids[0]
+        ok, document = DocumentService.get_by_id(referred_document_id)
        if not ok:
-            raise RuntimeError(f"Document {document_id} not found")
+            raise RuntimeError(f"Document for referred_document_id {referred_document_id} not found")
        DocumentService.update_progress_immediately([document.to_dict()])
-        ok, document = DocumentService.get_by_id(document_id)
+        ok, document = DocumentService.get_by_id(referred_document_id)
        if not ok:
-            raise RuntimeError(f"Document {document_id} not found")
+            raise RuntimeError(f"Document for referred_document_id {referred_document_id} not found")
+        if document.progress not in [1, -1]:
+            return
        operation_status = document.run

        if pipeline_id:
@ -84,7 +109,7 @@ class PipelineOperationLogService(CommonService):
            if not ok:
                raise RuntimeError(f"Pipeline {pipeline_id} not found")

-            pipeline = Pipeline(dsl=json.dumps(user_pipeline.dsl), tenant_id=user_pipeline.user_id, doc_id=document_id, task_id="", flow_id=pipeline_id)
+            pipeline = Pipeline(dsl=json.dumps(user_pipeline.dsl), tenant_id=user_pipeline.user_id, doc_id=referred_document_id, task_id="", flow_id=pipeline_id)

            tenant_id = user_pipeline.user_id
            title = user_pipeline.title
@ -93,7 +118,7 @@ class PipelineOperationLogService(CommonService):
        else:
            ok, kb_info = KnowledgebaseService.get_by_id(document.kb_id)
            if not ok:
-                raise RuntimeError(f"Cannot find knowledge base {document.kb_id} for document {document_id}")
+                raise RuntimeError(f"Cannot find knowledge base {document.kb_id} for referred_document {referred_document_id}")

            tenant_id = kb_info.tenant_id
            title = document.name
@ -104,7 +129,7 @@ class PipelineOperationLogService(CommonService):

        log = dict(
            id=get_uuid(),
-            document_id=document_id,
+            document_id=document_id,  # "x" or real document_id
            tenant_id=tenant_id,
            kb_id=document.kb_id,
            pipeline_id=pipeline_id,
@ -132,18 +157,20 @@ class PipelineOperationLogService(CommonService):

    @classmethod
    @DB.connection_context()
-    def record_pipeline_operation(cls, document_id, pipeline_id, task_type):
-        return cls.create(document_id=document_id, pipeline_id=pipeline_id, task_type=task_type)
+    def record_pipeline_operation(cls, document_id, pipeline_id, task_type, fake_document_ids=None):
+        """Record a pipeline operation log; thin wrapper around ``create``.
+
+        Args:
+            document_id: Forwarded to ``create`` unchanged.
+            pipeline_id: Forwarded to ``create`` unchanged.
+            task_type: Forwarded to ``create`` unchanged.
+            fake_document_ids: Optional list forwarded to ``create``. ``None``
+                (the default) is replaced with a fresh empty list.
+
+        Returns:
+            Whatever ``create`` returns.
+        """
+        # A ``[]`` default would be a single list object shared by every call;
+        # build a new one per call so ``create`` still always receives a list.
+        if fake_document_ids is None:
+            fake_document_ids = []
+        return cls.create(document_id=document_id, pipeline_id=pipeline_id, task_type=task_type, fake_document_ids=fake_document_ids)

    @classmethod
    @DB.connection_context()
-    def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix):
-        fields = cls.get_cls_model_fields()
+    def get_file_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix):
+        fields = cls.get_file_logs_fields()
        if keywords:
            logs = cls.model.select(*fields).where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.document_name).contains(keywords.lower())))
        else:
            logs = cls.model.select(*fields).where(cls.model.kb_id == kb_id)

+        logs = logs.where(cls.model.document_id != "x")
+
        if operation_status:
            logs = logs.where(cls.model.operation_status.in_(operation_status))
        if types:
@ -161,3 +188,23 @@ class PipelineOperationLogService(CommonService):
            logs = logs.paginate(page_number, items_per_page)

        return list(logs.dicts()), count
+
+    @classmethod
+    @DB.connection_context()
+    def get_dataset_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, operation_status):
+        """List dataset-level pipeline logs of a knowledge base.
+
+        Only rows whose ``document_id`` is the placeholder ``"x"`` are selected.
+
+        Args:
+            kb_id: Knowledge base whose logs are queried.
+            page_number: Page to return; pagination is skipped when falsy.
+            items_per_page: Page size; pagination is skipped when falsy.
+            orderby: Name handed to ``cls.model.getter_by`` to obtain the sort field.
+            desc: Sort descending when truthy, ascending otherwise.
+            operation_status: Statuses to keep; no status filter is applied when falsy.
+
+        Returns:
+            tuple: ``(rows, count)`` where ``rows`` is a list of dicts and
+            ``count`` is the number of matching rows before pagination.
+        """
+        query = cls.model.select(*cls.get_dataset_logs_fields()).where((cls.model.kb_id == kb_id), (cls.model.document_id == "x"))
+        if operation_status:
+            query = query.where(cls.model.operation_status.in_(operation_status))
+
+        # Count before ordering and pagination so it reflects every matching row.
+        total = query.count()
+
+        sort_field = cls.model.getter_by(orderby)
+        query = query.order_by(sort_field.desc() if desc else sort_field.asc())
+
+        if page_number and items_per_page:
+            query = query.paginate(page_number, items_per_page)
+
+        return list(query.dicts()), total