Feat: add support for pipeline logs operation (#10207)

### What problem does this PR solve? Add support for pipeline logs operation ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-02-05 18:15:06 +08:00 · 2025-09-23 09:46:31 +08:00
parent d0bfe8b10c
commit 0c557e37ad
8 changed files with 340 additions and 17 deletions
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@ -22,10 +22,11 @@ from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from api.db.services.user_service import TenantService, UserTenantService
 from api.utils.api_utils import server_error_response, get_data_error_result, validate_request, not_allowed_parameters
 from api.utils import get_uuid
-from api.db import StatusEnum, FileSource
+from api.db import StatusEnum, FileSource, VALID_FILE_TYPES
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import File
 from api.utils.api_utils import get_json_result
@ -35,7 +36,6 @@ from api.constants import DATASET_NAME_LIMIT
 from rag.settings import PAGERANK_FLD
 from rag.utils.storage_factory import STORAGE_IMPL

-
@manager.route('/create', methods=['post'])  # noqa: F821
@login_required
@validate_request("name")
@ -395,3 +395,84 @@ def get_basic_info():
    basic_info = DocumentService.knowledgebase_basic_info(kb_id)

    return get_json_result(data=basic_info)
+
+
+@manager.route("/list_pipeline_logs", methods=["POST"])  # noqa: F821
+@login_required
+def list_pipeline_logs():
+    kb_id = request.args.get("kb_id")
+    if not kb_id:
+        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    keywords = request.args.get("keywords", "")
+
+    page_number = int(request.args.get("page", 0))
+    items_per_page = int(request.args.get("page_size", 0))
+    orderby = request.args.get("orderby", "create_time")
+    if request.args.get("desc", "true").lower() == "false":
+        desc = False
+    else:
+        desc = True
+    create_time_from = int(request.args.get("create_time_from", 0))
+    create_time_to = int(request.args.get("create_time_to", 0))
+
+    req = request.get_json()
+
+    operation_status = req.get("operation_status", [])
+    if operation_status:
+        invalid_status = {s for s in operation_status if s not in ["success", "failed", "running", "pending"]}
+        if invalid_status:
+            return get_data_error_result(message=f"Invalid filter operation_status status conditions: {', '.join(invalid_status)}")
+
+    types = req.get("types", [])
+    if types:
+        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
+        if invalid_types:
+            return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")
+
+    suffix = req.get("suffix", [])
+
+    try:
+        docs, tol = PipelineOperationLogService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix)
+
+        if create_time_from or create_time_to:
+            filtered_docs = []
+            for doc in docs:
+                doc_create_time = doc.get("create_time", 0)
+                if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
+                    filtered_docs.append(doc)
+            docs = filtered_docs
+
+
+        return get_json_result(data={"total": tol, "docs": docs})
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/delete_pipeline_logs", methods=["POST"])  # noqa: F821
+@login_required
+def delete_pipeline_logs():
+    kb_id = request.args.get("kb_id")
+    if not kb_id:
+        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    req = request.get_json()
+    log_ids = req.get("log_ids", [])
+
+    PipelineOperationLogService.delete_by_ids(log_ids)
+
+    return get_json_result(data=True)
+
+
+@manager.route("/pipeline_log_detail", methods=["GET"])  # noqa: F821
+@login_required
+def pipeline_log_detail():
+    log_id = request.args.get("log_id")
+    if not log_id:
+        return get_json_result(data=False, message='Lack of "Pipeline log ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    ok, log = PipelineOperationLogService.get_by_id(log_id)
+    if not ok:
+        return get_data_error_result(message="Invalid pipeline log ID")
+
+    return get_json_result(data=log.to_dict())
--- a/api/db/init.py
+++ b/api/db/init.py
@ -122,4 +122,12 @@ class MCPServerType(StrEnum):
 VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}


+class PipelineTaskType(StrEnum):
+    PARSE = "Parse"
+    DOWNLOAD = "DOWNLOAD"
+
+
+VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD}
+
+
 KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@ -906,6 +906,32 @@ class Search(DataBaseModel):
        db_table = "search"


+class PipelineOperationLog(DataBaseModel):
+    id = CharField(max_length=32, primary_key=True)
+    document_id = CharField(max_length=32, index=True)
+    tenant_id = CharField(max_length=32, null=False, index=True)
+    kb_id = CharField(max_length=32, null=False, index=True)
+    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
+    pipeline_title = CharField(max_length=32, null=True, help_text="Pipeline title", index=True)
+    parser_id = CharField(max_length=32, null=False, help_text="Parser ID", index=True)
+    document_name = CharField(max_length=255, null=False, help_text="File name")
+    document_suffix = CharField(max_length=255, null=False, help_text="File suffix")
+    document_type = CharField(max_length=255, null=False, help_text="Document type")
+    source_from = CharField(max_length=255, null=False, help_text="Source")
+    progress = FloatField(default=0, index=True)
+    progress_msg = TextField(null=True, help_text="process message", default="")
+    process_begin_at = DateTimeField(null=True, index=True)
+    process_duration = FloatField(default=0)
+    dsl = JSONField(null=True, default=dict)
+    task_type = CharField(max_length=32, null=False, default="")
+    operation_status = CharField(max_length=32, null=False, help_text="Operation status")
+    avatar = TextField(null=True, help_text="avatar base64 string")
+    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)
+
+    class Meta:
+        db_table = "pipeline_operation_log"
+
+
 def migrate_db():
    logging.disable(logging.ERROR)
    migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -597,6 +597,22 @@ class DocumentService(CommonService):
    @DB.connection_context()
    def update_progress(cls):
        docs = cls.get_unfinished_docs()
+
+        cls._sync_progress(docs)
+
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress_immediately(cls, docs:list[dict]):
+        if not docs:
+            return
+
+        cls._sync_progress(docs)
+
+
+    @classmethod
+    @DB.connection_context()
+    def _sync_progress(cls, docs:list[dict]):
        for d in docs:
            try:
                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
--- a/api/db/services/pipeline_operation_log_service.py
+++ b/api/db/services/pipeline_operation_log_service.py
@ -0,0 +1,163 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import json
+from datetime import datetime
+
+from peewee import fn
+
+from api.db import VALID_PIPELINE_TASK_TYPES
+from api.db.db_models import DB, PipelineOperationLog
+from api.db.services.canvas_service import UserCanvasService
+from api.db.services.common_service import CommonService
+from api.db.services.document_service import DocumentService
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.utils import current_timestamp, datetime_format, get_uuid
+
+
+class PipelineOperationLogService(CommonService):
+    model = PipelineOperationLog
+
+    @classmethod
+    def get_cls_model_fields(cls):
+        return [
+            cls.model.id,
+            cls.model.document_id,
+            cls.model.tenant_id,
+            cls.model.kb_id,
+            cls.model.pipeline_id,
+            cls.model.pipeline_title,
+            cls.model.parser_id,
+            cls.model.document_name,
+            cls.model.document_suffix,
+            cls.model.document_type,
+            cls.model.source_from,
+            cls.model.progress,
+            cls.model.progress_msg,
+            cls.model.process_begin_at,
+            cls.model.process_duration,
+            cls.model.dsl,
+            cls.model.task_type,
+            cls.model.operation_status,
+            cls.model.avatar,
+            cls.model.status,
+            cls.model.create_time,
+            cls.model.create_date,
+            cls.model.update_time,
+            cls.model.update_date,
+        ]
+
+    @classmethod
+    @DB.connection_context()
+    def create(cls, document_id, pipeline_id, task_type):
+        from rag.flow.pipeline import Pipeline
+
+        tenant_id = ""
+        title = ""
+        avatar = ""
+        dsl = ""
+        operation_status = ""
+
+        ok, document = DocumentService.get_by_id(document_id)
+        if not ok:
+            raise RuntimeError(f"Document {document_id} not found")
+        DocumentService.update_progress_immediately([document.to_dict()])
+        ok, document = DocumentService.get_by_id(document_id)
+        if not ok:
+            raise RuntimeError(f"Document {document_id} not found")
+        operation_status = document.run
+
+        if pipeline_id:
+            ok, user_pipeline = UserCanvasService.get_by_id(pipeline_id)
+            if not ok:
+                raise RuntimeError(f"Pipeline {pipeline_id} not found")
+
+            pipeline = Pipeline(dsl=json.dumps(user_pipeline.dsl), tenant_id=user_pipeline.user_id, doc_id=document_id, task_id="", flow_id=pipeline_id)
+
+            tenant_id = user_pipeline.user_id
+            title = user_pipeline.title
+            avatar = user_pipeline.avatar
+            dsl = json.loads(str(pipeline))
+        else:
+            ok, kb_info = KnowledgebaseService.get_by_id(document.kb_id)
+            if not ok:
+                raise RuntimeError(f"Cannot find knowledge base {document.kb_id} for document {document_id}")
+
+            tenant_id = kb_info.tenant_id
+            title = document.name
+            avatar = document.thumbnail
+
+        if task_type not in VALID_PIPELINE_TASK_TYPES:
+            raise ValueError(f"Invalid task type: {task_type}")
+
+        log = dict(
+            id=get_uuid(),
+            document_id=document_id,
+            tenant_id=tenant_id,
+            kb_id=document.kb_id,
+            pipeline_id=pipeline_id,
+            pipeline_title=title,
+            parser_id=document.parser_id,
+            document_name=document.name,
+            document_suffix=document.suffix,
+            document_type=document.type,
+            source_from="",  # TODO: add in the future
+            progress=document.progress,
+            progress_msg=document.progress_msg,
+            process_begin_at=document.process_begin_at,
+            process_duration=document.process_duration,
+            dsl=dsl,
+            task_type=task_type,
+            operation_status=operation_status,
+            avatar=avatar,
+        )
+        log["create_time"] = current_timestamp()
+        log["create_date"] = datetime_format(datetime.now())
+        log["update_time"] = current_timestamp()
+        log["update_date"] = datetime_format(datetime.now())
+        obj = cls.save(**log)
+        return obj
+
+    @classmethod
+    @DB.connection_context()
+    def record_pipeline_operation(cls, document_id, pipeline_id, task_type):
+        return cls.create(document_id=document_id, pipeline_id=pipeline_id, task_type=task_type)
+
+    @classmethod
+    @DB.connection_context()
+    def get_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix):
+        fields = cls.get_cls_model_fields()
+        if keywords:
+            logs = cls.model.select(*fields).where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.document_name).contains(keywords.lower())))
+        else:
+            logs = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
+
+        if operation_status:
+            logs = logs.where(cls.model.operation_status.in_(operation_status))
+        if types:
+            logs = logs.where(cls.model.document_type.in_(types))
+        if suffix:
+            logs = logs.where(cls.model.document_suffix.in_(suffix))
+
+        count = logs.count()
+        if desc:
+            logs = logs.order_by(cls.model.getter_by(orderby).desc())
+        else:
+            logs = logs.order_by(cls.model.getter_by(orderby).asc())
+
+        if page_number and items_per_page:
+            logs = logs.paginate(page_number, items_per_page)
+
+        return list(logs.dicts()), count
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -231,7 +231,6 @@ class Parser(ProcessBase):

        if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
-
        if conf.get("output_format") == "markdown":
            mkdn = ""
            for b in bboxes:
@ -295,6 +294,7 @@ class Parser(ProcessBase):

    def _markdown(self, name, blob):
        from functools import reduce
+
        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

@ -346,7 +346,7 @@ class Parser(ProcessBase):

        else:
            # use VLM to describe the picture
-            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
+            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
            img_binary = io.BytesIO()
            img.save(img_binary, format="JPEG")
            img_binary.seek(0)
--- a/rag/flow/pipeline.py
+++ b/rag/flow/pipeline.py
@ -19,10 +19,13 @@ import logging
 import random
 import time
 from timeit import default_timer as timer
+
 import trio

 from agent.canvas import Graph
+from api.db import PipelineTaskType
 from api.db.services.document_service import DocumentService
+from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from rag.utils.redis_conn import REDIS_CONN


@ -44,22 +47,40 @@ class Pipeline(Graph):
            obj = json.loads(bin.encode("utf-8"))
            if obj:
                if obj[-1]["component_id"] == component_name:
-                    obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": timestamp-obj[-1]["trace"][-1]["timestamp"]})
+                    obj[-1]["trace"].append(
+                        {
+                            "progress": progress,
+                            "message": message,
+                            "datetime": datetime.datetime.now().strftime("%H:%M:%S"),
+                            "timestamp": timestamp,
+                            "elapsed_time": timestamp - obj[-1]["trace"][-1]["timestamp"],
+                        }
+                    )
                else:
-                    obj.append({"component_id": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}]})
+                    obj.append(
+                        {
+                            "component_id": component_name,
+                            "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
+                        }
+                    )
            else:
-                obj = [{"component_id": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}]}]
+                obj = [
+                    {
+                        "component_id": component_name,
+                        "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
+                    }
+                ]
            REDIS_CONN.set_obj(log_key, obj, 60 * 30)
            if self._doc_id:
-                percentage = 1./len(self.components.items())
+                percentage = 1.0 / len(self.components.items())
                msg = ""
-                finished = 0.
+                finished = 0.0
                for o in obj:
-                    if o['component_id'] == "END":
+                    if o["component_id"] == "END":
                        continue
                    msg += f"\n[{o['component_id']}]:\n"
                    for t in o["trace"]:
-                        msg += "%s: %s\n"%(t["datetime"], t["message"])
+                        msg += "%s: %s\n" % (t["datetime"], t["message"])
                        if t["progress"] < 0:
                            finished = -1
                            break
@ -129,8 +150,13 @@ class Pipeline(Graph):
        self.callback("END", 1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))

        if self._doc_id:
-            DocumentService.update_by_id(self._doc_id,{
-                "progress": 1 if not self.error else -1,
-                "progress_msg": "Pipeline finished...\n" + self.error,
-                "process_duration": time.perf_counter() - st
-            })
+            DocumentService.update_by_id(
+                self._doc_id,
+                {
+                    "progress": 1 if not self.error else -1,
+                    "progress_msg": "Pipeline finished...\n" + self.error,
+                    "process_duration": time.perf_counter() - st,
+                },
+            )
+
+            PipelineOperationLogService.create(document_id=self._doc_id, pipeline_id=self._flow_id, task_type=PipelineTaskType.PARSE)
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -21,6 +21,7 @@ import sys
 import threading
 import time
 from api.db.services.canvas_service import UserCanvasService
+from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
 from api.utils.api_utils import timeout
 from api.utils.base64_image import image2id
 from api.utils.log_utils import init_root_logger, get_project_base_directory
@ -45,7 +46,7 @@ import exceptiongroup
 import faulthandler
 import numpy as np
 from peewee import DoesNotExist
-from api.db import LLMType, ParserType
+from api.db import LLMType, ParserType, PipelineTaskType
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService, has_canceled
@ -650,6 +651,7 @@ async def do_handle_task(task):
                                                                                     timer() - start_ts))

    DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, token_count, chunk_count, 0)
+    PipelineOperationLogService.record_pipeline_operation(document_id=task_doc_id, pipeline_id="", task_type=PipelineTaskType.PARSE)

    time_cost = timer() - start_ts
    task_time_cost = timer() - task_start_ts
@ -685,6 +687,7 @@ async def handle_task():
        except Exception:
            pass
        logging.exception(f"handle_task got exception for task {json.dumps(task)}")
+        PipelineOperationLogService.record_pipeline_operation(document_id=task["doc_id"], pipeline_id=task.get("dataflow_id", "") or "", task_type=PipelineTaskType.PARSE)
    redis_msg.ack()