Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen 
Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2026-01-30 23:26:36 +08:00 · 2025-10-09 12:36:19 +08:00
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions
--- a/api/apps/canvas_app.py
+++ b/api/apps/canvas_app.py
@ -19,15 +19,19 @@ import re
 import sys
 from functools import partial

+import flask
 import trio
 from flask import request, Response
 from flask_login import login_required, current_user

-from agent.component.llm import LLM
+from agent.component import LLM
+from api import settings
 from api.db import CanvasCategory, FileType
 from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService, API4ConversationService
 from api.db.services.document_service import DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
+from api.db.services.task_service import queue_dataflow, CANVAS_DEBUG_DOC_ID, TaskService
 from api.db.services.user_service import TenantService
 from api.db.services.user_canvas_version import UserCanvasVersionService
 from api.settings import RetCode
@ -35,10 +39,12 @@ from api.utils import get_uuid
 from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
 from agent.canvas import Canvas
 from peewee import MySQLDatabase, PostgresqlDatabase
-from api.db.db_models import APIToken
+from api.db.db_models import APIToken, Task
 import time

 from api.utils.file_utils import filename_type, read_potential_broken_pdf
+from rag.flow.pipeline import Pipeline
+from rag.nlp import search
 from rag.utils.redis_conn import REDIS_CONN


@ -48,14 +54,6 @@ def templates():
    return get_json_result(data=[c.to_dict() for c in CanvasTemplateService.query(canvas_category=CanvasCategory.Agent)])


-@manager.route('/list', methods=['GET'])  # noqa: F821
-@login_required
-def canvas_list():
-    return get_json_result(data=sorted([c.to_dict() for c in \
-                                 UserCanvasService.query(user_id=current_user.id, canvas_category=CanvasCategory.Agent)], key=lambda x: x["update_time"]*-1)
-                           )
-
-
@manager.route('/rm', methods=['POST'])  # noqa: F821
@validate_request("canvas_ids")
@login_required
@ -77,9 +75,10 @@ def save():
    if not isinstance(req["dsl"], str):
        req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)
    req["dsl"] = json.loads(req["dsl"])
+    cate = req.get("canvas_category", CanvasCategory.Agent)
    if "id" not in req:
        req["user_id"] = current_user.id
-        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=CanvasCategory.Agent):
+        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=cate):
            return get_data_error_result(message=f"{req['title'].strip()} already exists.")
        req["id"] = get_uuid()
        if not UserCanvasService.save(**req):
@ -148,6 +147,14 @@ def run():
    if not isinstance(cvs.dsl, str):
        cvs.dsl = json.dumps(cvs.dsl, ensure_ascii=False)

+    if cvs.canvas_category == CanvasCategory.DataFlow:
+        task_id = get_uuid()
+        Pipeline(cvs.dsl, tenant_id=current_user.id, doc_id=CANVAS_DEBUG_DOC_ID, task_id=task_id, flow_id=req["id"])
+        ok, error_message = queue_dataflow(tenant_id=user_id, flow_id=req["id"], task_id=task_id, file=files[0], priority=0)
+        if not ok:
+            return get_data_error_result(message=error_message)
+        return get_json_result(data={"message_id": task_id})
+
    try:
        canvas = Canvas(cvs.dsl, current_user.id, req["id"])
    except Exception as e:
@ -173,6 +180,44 @@ def run():
    return resp


+@manager.route('/rerun', methods=['POST'])  # noqa: F821
+@validate_request("id", "dsl", "component_id")
+@login_required
+def rerun():
+    """Reset a document's parse results and queue its dataflow again.
+
+    Expects a JSON body with:
+      - ``id``: key passed to ``PipelineOperationLogService.get_documents_info``
+        and used as the ``flow_id`` of the queued task.
+      - ``dsl``: the pipeline DSL (a mapping; its ``path`` entry is overwritten).
+      - ``component_id``: the component the rerun starts from.
+
+    Returns a JSON result with ``data=True`` once the task is queued, or a
+    data-error result when the document is missing, still being processed,
+    or the task could not be queued.
+    """
+    req = request.json
+    docs = PipelineOperationLogService.get_documents_info(req["id"])
+    if not docs:
+        return get_data_error_result(message="Document not found.")
+    doc = docs[0]
+    doc_id = doc["id"]
+    # Progress strictly between 0 and 1 means a parse is still in flight.
+    if 0 < doc["progress"] < 1:
+        return get_data_error_result(message=f"`{doc['name']}` is processing...")
+
+    # Drop the chunks previously indexed for this document, if the index exists.
+    index_name = search.index_name(current_user.id)
+    if settings.docStoreConn.indexExist(index_name, doc["kb_id"]):
+        settings.docStoreConn.delete({"doc_id": doc_id}, index_name, doc["kb_id"])
+    doc["progress_msg"] = ""
+    doc["chunk_num"] = 0
+    doc["token_num"] = 0
+    DocumentService.clear_chunk_num_when_rerun(doc_id)
+    # Use the document id here: the bare name `id` is the Python builtin, so
+    # the update and the task cleanup would otherwise never match any row.
+    DocumentService.update_by_id(doc_id, doc)
+    TaskService.filter_delete([Task.doc_id == doc_id])
+
+    dsl = req["dsl"]
+    dsl["path"] = [req["component_id"]]
+    PipelineOperationLogService.update_by_id(req["id"], {"dsl": dsl})
+    # Surface queueing failures the same way the `run` handler does.
+    ok, error_message = queue_dataflow(tenant_id=current_user.id, flow_id=req["id"], task_id=get_uuid(), doc_id=doc_id, priority=0, rerun=True)
+    if not ok:
+        return get_data_error_result(message=error_message)
+    return get_json_result(data=True)
+
+
+@manager.route('/cancel/<task_id>', methods=['PUT'])  # noqa: F821
+@login_required
+def cancel(task_id):
+    """Flag a task as cancelled by writing the ``<task_id>-cancel`` key to Redis.
+
+    A failure to write the key is logged and otherwise ignored; the response
+    is always a JSON result with ``data=True``.
+    """
+    cancel_key = f"{task_id}-cancel"
+    try:
+        REDIS_CONN.set(cancel_key, "x")
+    except Exception as err:
+        logging.exception(err)
+    return get_json_result(data=True)
+
+
@manager.route('/reset', methods=['POST'])  # noqa: F821
@validate_request("id")
@login_required
@ -399,22 +444,32 @@ def getversion( version_id):
        return get_json_result(data=f"Error getting history file: {e}")


-@manager.route('/listteam', methods=['GET'])  # noqa: F821
+@manager.route('/list', methods=['GET'])  # noqa: F821
@login_required
 def list_canvas():
    keywords = request.args.get("keywords", "")
-    page_number = int(request.args.get("page", 1))
-    items_per_page = int(request.args.get("page_size", 150))
+    page_number = int(request.args.get("page", 0))
+    items_per_page = int(request.args.get("page_size", 0))
    orderby = request.args.get("orderby", "create_time")
-    desc = request.args.get("desc", True)
-    try:
+    canvas_category = request.args.get("canvas_category")
+    if request.args.get("desc", "true").lower() == "false":
+        desc = False
+    else:
+        desc = True
+    owner_ids = [id for id in request.args.get("owner_ids", "").strip().split(",") if id]
+    if not owner_ids:
        tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
+        tenants = [m["tenant_id"] for m in tenants]
+        tenants.append(current_user.id)
        canvas, total = UserCanvasService.get_by_tenant_ids(
-            [m["tenant_id"] for m in tenants], current_user.id, page_number,
-            items_per_page, orderby, desc, keywords, canvas_category=CanvasCategory.Agent)
-        return get_json_result(data={"canvas": canvas, "total": total})
-    except Exception as e:
-        return server_error_response(e)
+            tenants, current_user.id, page_number,
+            items_per_page, orderby, desc, keywords, canvas_category)
+    else:
+        tenants = owner_ids
+        canvas, total = UserCanvasService.get_by_tenant_ids(
+            tenants, current_user.id, 0,
+            0, orderby, desc, keywords, canvas_category)
+    return get_json_result(data={"canvas": canvas, "total": total})


@manager.route('/setting', methods=['POST'])  # noqa: F821
@ -499,3 +554,11 @@ def prompts():
        #"context_ranking": RANK_MEMORY,
        "citation_guidelines": CITATION_PROMPT_TEMPLATE
    })
+
+
+@manager.route('/download', methods=['GET'])  # noqa: F821
+def download():
+    """Return the raw content of a stored blob as the HTTP response body.
+
+    Query parameters:
+      - ``id``: the blob identifier.
+      - ``created_by``: passed as the first argument of ``FileService.get_blob``.
+
+    Returns a data-error result when either parameter is missing or no blob
+    is returned for them.
+    """
+    # NOTE(review): this route is not protected by @login_required, so anyone
+    # who knows (or guesses) `created_by` and `id` can read the blob. Confirm
+    # that unauthenticated access is intentional.
+    blob_id = request.args.get("id")
+    created_by = request.args.get("created_by")
+    if not blob_id or not created_by:
+        return get_data_error_result(message="`id` and `created_by` are required.")
+    blob = FileService.get_blob(created_by, blob_id)
+    # flask.make_response cannot build a response from None.
+    if blob is None:
+        return get_data_error_result(message="File not found.")
+    return flask.make_response(blob)
--- a/api/apps/dataflow_app.py
+++ b/api/apps/dataflow_app.py
@ -1,353 +0,0 @@
-#
-#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import json
-import re
-import sys
-import time
-from functools import partial
-
-import trio
-from flask import request
-from flask_login import current_user, login_required
-
-from agent.canvas import Canvas
-from agent.component.llm import LLM
-from api.db import CanvasCategory, FileType
-from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
-from api.db.services.document_service import DocumentService
-from api.db.services.file_service import FileService
-from api.db.services.task_service import queue_dataflow
-from api.db.services.user_canvas_version import UserCanvasVersionService
-from api.db.services.user_service import TenantService
-from api.settings import RetCode
-from api.utils import get_uuid
-from api.utils.api_utils import get_data_error_result, get_json_result, server_error_response, validate_request
-from api.utils.file_utils import filename_type, read_potential_broken_pdf
-from rag.flow.pipeline import Pipeline
-
-
-@manager.route("/templates", methods=["GET"])  # noqa: F821
-@login_required
-def templates():
-    return get_json_result(data=[c.to_dict() for c in CanvasTemplateService.query(canvas_category=CanvasCategory.DataFlow)])
-
-
-@manager.route("/list", methods=["GET"])  # noqa: F821
-@login_required
-def canvas_list():
-    return get_json_result(data=sorted([c.to_dict() for c in UserCanvasService.query(user_id=current_user.id, canvas_category=CanvasCategory.DataFlow)], key=lambda x: x["update_time"] * -1))
-
-
-@manager.route("/rm", methods=["POST"])  # noqa: F821
-@validate_request("canvas_ids")
-@login_required
-def rm():
-    for i in request.json["canvas_ids"]:
-        if not UserCanvasService.accessible(i, current_user.id):
-            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-        UserCanvasService.delete_by_id(i)
-    return get_json_result(data=True)
-
-
-@manager.route("/set", methods=["POST"])  # noqa: F821
-@validate_request("dsl", "title")
-@login_required
-def save():
-    req = request.json
-    if not isinstance(req["dsl"], str):
-        req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)
-    req["dsl"] = json.loads(req["dsl"])
-    req["canvas_category"] = CanvasCategory.DataFlow
-    if "id" not in req:
-        req["user_id"] = current_user.id
-        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=CanvasCategory.DataFlow):
-            return get_data_error_result(message=f"{req['title'].strip()} already exists.")
-        req["id"] = get_uuid()
-
-        if not UserCanvasService.save(**req):
-            return get_data_error_result(message="Fail to save canvas.")
-    else:
-        if not UserCanvasService.accessible(req["id"], current_user.id):
-            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-        UserCanvasService.update_by_id(req["id"], req)
-    # save version
-    UserCanvasVersionService.insert(user_canvas_id=req["id"], dsl=req["dsl"], title="{0}_{1}".format(req["title"], time.strftime("%Y_%m_%d_%H_%M_%S")))
-    UserCanvasVersionService.delete_all_versions(req["id"])
-    return get_json_result(data=req)
-
-
-@manager.route("/get/<canvas_id>", methods=["GET"])  # noqa: F821
-@login_required
-def get(canvas_id):
-    if not UserCanvasService.accessible(canvas_id, current_user.id):
-        return get_data_error_result(message="canvas not found.")
-    e, c = UserCanvasService.get_by_canvas_id(canvas_id)
-    return get_json_result(data=c)
-
-
-@manager.route("/run", methods=["POST"])  # noqa: F821
-@validate_request("id")
-@login_required
-def run():
-    req = request.json
-    flow_id = req.get("id", "")
-    doc_id = req.get("doc_id", "")
-    if not all([flow_id, doc_id]):
-        return get_data_error_result(message="id and doc_id are required.")
-
-    if not DocumentService.get_by_id(doc_id):
-        return get_data_error_result(message=f"Document for {doc_id} not found.")
-
-    user_id = req.get("user_id", current_user.id)
-    if not UserCanvasService.accessible(flow_id, current_user.id):
-        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-
-    e, cvs = UserCanvasService.get_by_id(flow_id)
-    if not e:
-        return get_data_error_result(message="canvas not found.")
-
-    if not isinstance(cvs.dsl, str):
-        cvs.dsl = json.dumps(cvs.dsl, ensure_ascii=False)
-
-    task_id = get_uuid()
-
-    ok, error_message = queue_dataflow(dsl=cvs.dsl, tenant_id=user_id, doc_id=doc_id, task_id=task_id, flow_id=flow_id, priority=0)
-    if not ok:
-        return server_error_response(error_message)
-
-    return get_json_result(data={"task_id": task_id, "flow_id": flow_id})
-
-
-@manager.route("/reset", methods=["POST"])  # noqa: F821
-@validate_request("id")
-@login_required
-def reset():
-    req = request.json
-    flow_id = req.get("id", "")
-    if not flow_id:
-        return get_data_error_result(message="id is required.")
-
-    if not UserCanvasService.accessible(flow_id, current_user.id):
-        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-
-    task_id = req.get("task_id", "")
-
-    try:
-        e, user_canvas = UserCanvasService.get_by_id(req["id"])
-        if not e:
-            return get_data_error_result(message="canvas not found.")
-
-        dataflow = Pipeline(dsl=json.dumps(user_canvas.dsl), tenant_id=current_user.id, flow_id=flow_id, task_id=task_id)
-        dataflow.reset()
-        req["dsl"] = json.loads(str(dataflow))
-        UserCanvasService.update_by_id(req["id"], {"dsl": req["dsl"]})
-        return get_json_result(data=req["dsl"])
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/upload/<canvas_id>", methods=["POST"])  # noqa: F821
-def upload(canvas_id):
-    e, cvs = UserCanvasService.get_by_canvas_id(canvas_id)
-    if not e:
-        return get_data_error_result(message="canvas not found.")
-
-    user_id = cvs["user_id"]
-
-    def structured(filename, filetype, blob, content_type):
-        nonlocal user_id
-        if filetype == FileType.PDF.value:
-            blob = read_potential_broken_pdf(blob)
-
-        location = get_uuid()
-        FileService.put_blob(user_id, location, blob)
-
-        return {
-            "id": location,
-            "name": filename,
-            "size": sys.getsizeof(blob),
-            "extension": filename.split(".")[-1].lower(),
-            "mime_type": content_type,
-            "created_by": user_id,
-            "created_at": time.time(),
-            "preview_url": None,
-        }
-
-    if request.args.get("url"):
-        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult, DefaultMarkdownGenerator, PruningContentFilter
-
-        try:
-            url = request.args.get("url")
-            filename = re.sub(r"\?.*", "", url.split("/")[-1])
-
-            async def adownload():
-                browser_config = BrowserConfig(
-                    headless=True,
-                    verbose=False,
-                )
-                async with AsyncWebCrawler(config=browser_config) as crawler:
-                    crawler_config = CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator(content_filter=PruningContentFilter()), pdf=True, screenshot=False)
-                    result: CrawlResult = await crawler.arun(url=url, config=crawler_config)
-                    return result
-
-            page = trio.run(adownload())
-            if page.pdf:
-                if filename.split(".")[-1].lower() != "pdf":
-                    filename += ".pdf"
-                return get_json_result(data=structured(filename, "pdf", page.pdf, page.response_headers["content-type"]))
-
-            return get_json_result(data=structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"], user_id))
-
-        except Exception as e:
-            return server_error_response(e)
-
-    file = request.files["file"]
-    try:
-        DocumentService.check_doc_health(user_id, file.filename)
-        return get_json_result(data=structured(file.filename, filename_type(file.filename), file.read(), file.content_type))
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/input_form", methods=["GET"])  # noqa: F821
-@login_required
-def input_form():
-    flow_id = request.args.get("id")
-    cpn_id = request.args.get("component_id")
-    try:
-        e, user_canvas = UserCanvasService.get_by_id(flow_id)
-        if not e:
-            return get_data_error_result(message="canvas not found.")
-        if not UserCanvasService.query(user_id=current_user.id, id=flow_id):
-            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-
-        dataflow = Pipeline(dsl=json.dumps(user_canvas.dsl), tenant_id=current_user.id, flow_id=flow_id, task_id="")
-
-        return get_json_result(data=dataflow.get_component_input_form(cpn_id))
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/debug", methods=["POST"])  # noqa: F821
-@validate_request("id", "component_id", "params")
-@login_required
-def debug():
-    req = request.json
-    if not UserCanvasService.accessible(req["id"], current_user.id):
-        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-    try:
-        e, user_canvas = UserCanvasService.get_by_id(req["id"])
-        canvas = Canvas(json.dumps(user_canvas.dsl), current_user.id)
-        canvas.reset()
-        canvas.message_id = get_uuid()
-        component = canvas.get_component(req["component_id"])["obj"]
-        component.reset()
-
-        if isinstance(component, LLM):
-            component.set_debug_inputs(req["params"])
-        component.invoke(**{k: o["value"] for k, o in req["params"].items()})
-        outputs = component.output()
-        for k in outputs.keys():
-            if isinstance(outputs[k], partial):
-                txt = ""
-                for c in outputs[k]():
-                    txt += c
-                outputs[k] = txt
-        return get_json_result(data=outputs)
-    except Exception as e:
-        return server_error_response(e)
-
-
-# api get list version dsl of canvas
-@manager.route("/getlistversion/<canvas_id>", methods=["GET"])  # noqa: F821
-@login_required
-def getlistversion(canvas_id):
-    try:
-        list = sorted([c.to_dict() for c in UserCanvasVersionService.list_by_canvas_id(canvas_id)], key=lambda x: x["update_time"] * -1)
-        return get_json_result(data=list)
-    except Exception as e:
-        return get_data_error_result(message=f"Error getting history files: {e}")
-
-
-# api get version dsl of canvas
-@manager.route("/getversion/<version_id>", methods=["GET"])  # noqa: F821
-@login_required
-def getversion(version_id):
-    try:
-        e, version = UserCanvasVersionService.get_by_id(version_id)
-        if version:
-            return get_json_result(data=version.to_dict())
-    except Exception as e:
-        return get_json_result(data=f"Error getting history file: {e}")
-
-
-@manager.route("/listteam", methods=["GET"])  # noqa: F821
-@login_required
-def list_canvas():
-    keywords = request.args.get("keywords", "")
-    page_number = int(request.args.get("page", 1))
-    items_per_page = int(request.args.get("page_size", 150))
-    orderby = request.args.get("orderby", "create_time")
-    desc = request.args.get("desc", True)
-    try:
-        tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
-        canvas, total = UserCanvasService.get_by_tenant_ids(
-            [m["tenant_id"] for m in tenants], current_user.id, page_number, items_per_page, orderby, desc, keywords, canvas_category=CanvasCategory.DataFlow
-        )
-        return get_json_result(data={"canvas": canvas, "total": total})
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/setting", methods=["POST"])  # noqa: F821
-@validate_request("id", "title", "permission")
-@login_required
-def setting():
-    req = request.json
-    req["user_id"] = current_user.id
-
-    if not UserCanvasService.accessible(req["id"], current_user.id):
-        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
-
-    e, flow = UserCanvasService.get_by_id(req["id"])
-    if not e:
-        return get_data_error_result(message="canvas not found.")
-    flow = flow.to_dict()
-    flow["title"] = req["title"]
-    for key in ("description", "permission", "avatar"):
-        if value := req.get(key):
-            flow[key] = value
-
-    num = UserCanvasService.update_by_id(req["id"], flow)
-    return get_json_result(data=num)
-
-
-@manager.route("/trace", methods=["GET"])  # noqa: F821
-def trace():
-    dataflow_id = request.args.get("dataflow_id")
-    task_id = request.args.get("task_id")
-    if not all([dataflow_id, task_id]):
-        return get_data_error_result(message="dataflow_id and task_id are required.")
-
-    e, dataflow_canvas = UserCanvasService.get_by_id(dataflow_id)
-    if not e:
-        return get_data_error_result(message="dataflow not found.")
-
-    dsl_str = json.dumps(dataflow_canvas.dsl, ensure_ascii=False)
-    dataflow = Pipeline(dsl=dsl_str, tenant_id=dataflow_canvas.user_id, flow_id=dataflow_id, task_id=task_id)
-    log = dataflow.fetch_logs()
-
-    return get_json_result(data=log)
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@ -33,7 +33,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
+from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks, queue_dataflow
 from api.db.services.user_service import UserTenantService
 from api.utils import get_uuid
 from api.utils.api_utils import (
@ -187,6 +187,7 @@ def create():
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": kb.parser_id,
+                "pipeline_id": kb.pipeline_id,
                "parser_config": kb.parser_config,
                "created_by": current_user.id,
                "type": FileType.VIRTUAL,
@ -484,8 +485,11 @@ def run():
                        kb_table_num_map[kb_id] = count
                        if kb_table_num_map[kb_id] <= 0:
                            KnowledgebaseService.delete_field_map(kb_id)
-                bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
-                queue_tasks(doc, bucket, name, 0)
+                if doc.get("pipeline_id", ""):
+                    queue_dataflow(tenant_id, flow_id=doc["pipeline_id"], task_id=get_uuid(), doc_id=id)
+                else:
+                    bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
+                    queue_tasks(doc, bucket, name, 0)

        return get_json_result(data=True)
    except Exception as e:
@ -551,31 +555,22 @@ def get(doc_id):

@manager.route("/change_parser", methods=["POST"])  # noqa: F821
@login_required
-@validate_request("doc_id", "parser_id")
+@validate_request("doc_id")
 def change_parser():
    req = request.json

    if not DocumentService.accessible(req["doc_id"], current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
-    try:
-        e, doc = DocumentService.get_by_id(req["doc_id"])
-        if not e:
-            return get_data_error_result(message="Document not found!")
-        if doc.parser_id.lower() == req["parser_id"].lower():
-            if "parser_config" in req:
-                if req["parser_config"] == doc.parser_config:
-                    return get_json_result(data=True)
-            else:
-                return get_json_result(data=True)

-        if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
-            return get_data_error_result(message="Not supported yet!")
+    e, doc = DocumentService.get_by_id(req["doc_id"])
+    if not e:
+        return get_data_error_result(message="Document not found!")

+    def reset_doc():
+        nonlocal doc
        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
        if not e:
            return get_data_error_result(message="Document not found!")
-        if "parser_config" in req:
-            DocumentService.update_parser_config(doc.id, req["parser_config"])
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1)
            if not e:
@ -586,6 +581,26 @@ def change_parser():
            if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
                settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

+    try:
+        if "pipeline_id" in req:
+            if doc.pipeline_id == req["pipeline_id"]:
+                return get_json_result(data=True)
+            DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]})
+            reset_doc()
+            return get_json_result(data=True)
+
+        if doc.parser_id.lower() == req["parser_id"].lower():
+            if "parser_config" in req:
+                if req["parser_config"] == doc.parser_config:
+                    return get_json_result(data=True)
+            else:
+                return get_json_result(data=True)
+
+        if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
+            return get_data_error_result(message="Not supported yet!")
+        if "parser_config" in req:
+            DocumentService.update_parser_config(doc.id, req["parser_config"])
+        reset_doc()
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@ -179,9 +179,6 @@ def list_files():
        if not e:
            return get_data_error_result(message="Folder not found!")

-        if not check_file_team_permission(file, current_user.id):
-            return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
-
        files, total = FileService.get_by_pf_id(
            current_user.id, pf_id, page_number, items_per_page, orderby, desc, keywords)

@ -213,9 +210,6 @@ def get_parent_folder():
        if not e:
            return get_data_error_result(message="Folder not found!")

-        if not check_file_team_permission(file, current_user.id):
-            return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
-
        parent_folder = FileService.get_parent_folder(file_id)
        return get_json_result(data={"parent_folder": parent_folder.to_json()})
    except Exception as e:
@ -231,9 +225,6 @@ def get_all_parent_folders():
        if not e:
            return get_data_error_result(message="Folder not found!")

-        if not check_file_team_permission(file, current_user.id):
-            return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
-
        parent_folders = FileService.get_all_parent_folders(file_id)
        parent_folders_res = []
        for parent_folder in parent_folders:
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@ -14,18 +14,21 @@
 #  limitations under the License.
 #
 import json
+import logging

 from flask import request
 from flask_login import login_required, current_user

 from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService
+from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
+from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
+from api.db.services.task_service import TaskService, GRAPH_RAPTOR_FAKE_DOC_ID
 from api.db.services.user_service import TenantService, UserTenantService
-from api.utils.api_utils import server_error_response, get_data_error_result, validate_request, not_allowed_parameters, active_required
+from api.utils.api_utils import get_error_data_result, server_error_response, get_data_error_result, validate_request, not_allowed_parameters
 from api.utils import get_uuid
-from api.db import StatusEnum, FileSource
+from api.db import PipelineTaskType, StatusEnum, FileSource, VALID_FILE_TYPES, VALID_TASK_STATUS
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import File
 from api.utils.api_utils import get_json_result
@ -38,7 +41,6 @@ from rag.utils.storage_factory import STORAGE_IMPL

@manager.route('/create', methods=['post'])  # noqa: F821
@login_required
-@active_required
@validate_request("name")
 def create():
    req = request.json
@ -62,10 +64,39 @@ def create():
        req["name"] = dataset_name
        req["tenant_id"] = current_user.id
        req["created_by"] = current_user.id
+        if not req.get("parser_id"):
+            req["parser_id"] = "naive"
        e, t = TenantService.get_by_id(current_user.id)
        if not e:
            return get_data_error_result(message="Tenant not found.")
-        req["embd_id"] = t.embd_id
+        req["parser_config"] = {
+            "layout_recognize": "DeepDOC",
+            "chunk_token_num": 512,
+            "delimiter": "\n",
+            "auto_keywords": 0,
+            "auto_questions": 0,
+            "html4excel": False,
+            "topn_tags": 3,
+            "raptor": {
+                "use_raptor": True,
+                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+                "max_token": 256,
+                "threshold": 0.1,
+                "max_cluster": 64,
+                "random_seed": 0
+            },
+            "graphrag": {
+                "use_graphrag": True,
+                "entity_types": [
+                    "organization",
+                    "person",
+                    "geo",
+                    "event",
+                    "category"
+                ],
+                "method": "light"
+            }
+        }
        if not KnowledgebaseService.save(**req):
            return get_data_error_result()
        return get_json_result(data={"kb_id": req["id"]})
@ -396,3 +427,352 @@ def get_basic_info():
    basic_info = DocumentService.knowledgebase_basic_info(kb_id)

    return get_json_result(data=basic_info)
+
+
+@manager.route("/list_pipeline_logs", methods=["POST"])  # noqa: F821
+@login_required
+def list_pipeline_logs():
+    """List per-document pipeline operation logs of one knowledge base.
+
+    Query string: ``kb_id`` (required), ``keywords``, ``page``, ``page_size``,
+    ``orderby`` (default ``create_time``), ``desc`` (default true),
+    ``create_date_from`` and ``create_date_to``.
+    JSON body (all optional): ``operation_status``, ``types``, ``suffix``.
+
+    Returns a JSON result ``{"total": ..., "logs": ...}``, or an error payload
+    when an argument is missing or invalid or the lookup raises.
+    """
+    kb_id = request.args.get("kb_id")
+    if not kb_id:
+        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    keywords = request.args.get("keywords", "")
+
+    # Non-numeric paging values are a client error, not an unhandled ValueError.
+    try:
+        page_number = int(request.args.get("page", 0))
+        items_per_page = int(request.args.get("page_size", 0))
+    except ValueError:
+        return get_json_result(data=False, message='"page" and "page_size" must be integers', code=settings.RetCode.ARGUMENT_ERROR)
+    orderby = request.args.get("orderby", "create_time")
+    desc = request.args.get("desc", "true").lower() != "false"
+    create_date_from = request.args.get("create_date_from", "")
+    create_date_to = request.args.get("create_date_to", "")
+    # NOTE(review): plain string comparison; this rejects a "to" value that sorts
+    # after "from", i.e. "from" is expected to be the later bound. Confirm this
+    # matches how PipelineOperationLogService applies the two filters.
+    if create_date_to > create_date_from:
+        return get_data_error_result(message="Create data filter is abnormal.")
+
+    # A JSON ``null`` body parses to None; treat it as "no filters".
+    req = request.get_json() or {}
+
+    operation_status = req.get("operation_status", [])
+    if operation_status:
+        # str() keeps the message buildable when clients send non-string values;
+        # sorted() makes the message deterministic.
+        invalid_status = sorted({str(s) for s in operation_status if s not in VALID_TASK_STATUS})
+        if invalid_status:
+            return get_data_error_result(message=f"Invalid filter operation_status status conditions: {', '.join(invalid_status)}")
+
+    types = req.get("types", [])
+    if types:
+        invalid_types = sorted({str(t) for t in types if t not in VALID_FILE_TYPES})
+        if invalid_types:
+            return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")
+
+    suffix = req.get("suffix", [])
+
+    try:
+        logs, tol = PipelineOperationLogService.get_file_logs_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix, create_date_from, create_date_to)
+        return get_json_result(data={"total": tol, "logs": logs})
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/list_pipeline_dataset_logs", methods=["POST"])  # noqa: F821
+@login_required
+def list_pipeline_dataset_logs():
+    """List dataset-level pipeline operation logs of one knowledge base.
+
+    Query string: ``kb_id`` (required), ``page``, ``page_size``, ``orderby``
+    (default ``create_time``), ``desc`` (default true), ``create_date_from``
+    and ``create_date_to``.
+    JSON body (optional): ``operation_status``.
+
+    Returns a JSON result ``{"total": ..., "logs": ...}``, or an error payload
+    when an argument is missing or invalid or the lookup raises.
+    """
+    kb_id = request.args.get("kb_id")
+    if not kb_id:
+        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    # Non-numeric paging values are a client error, not an unhandled ValueError.
+    try:
+        page_number = int(request.args.get("page", 0))
+        items_per_page = int(request.args.get("page_size", 0))
+    except ValueError:
+        return get_json_result(data=False, message='"page" and "page_size" must be integers', code=settings.RetCode.ARGUMENT_ERROR)
+    orderby = request.args.get("orderby", "create_time")
+    desc = request.args.get("desc", "true").lower() != "false"
+    create_date_from = request.args.get("create_date_from", "")
+    create_date_to = request.args.get("create_date_to", "")
+    # NOTE(review): plain string comparison; this rejects a "to" value that sorts
+    # after "from", i.e. "from" is expected to be the later bound. Confirm this
+    # matches how PipelineOperationLogService applies the two filters.
+    if create_date_to > create_date_from:
+        return get_data_error_result(message="Create data filter is abnormal.")
+
+    # A JSON ``null`` body parses to None; treat it as "no filters".
+    req = request.get_json() or {}
+
+    operation_status = req.get("operation_status", [])
+    if operation_status:
+        # str() keeps the message buildable when clients send non-string values;
+        # sorted() makes the message deterministic.
+        invalid_status = sorted({str(s) for s in operation_status if s not in VALID_TASK_STATUS})
+        if invalid_status:
+            return get_data_error_result(message=f"Invalid filter operation_status status conditions: {', '.join(invalid_status)}")
+
+    try:
+        logs, tol = PipelineOperationLogService.get_dataset_logs_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, operation_status, create_date_from, create_date_to)
+        return get_json_result(data={"total": tol, "logs": logs})
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/delete_pipeline_logs", methods=["POST"])  # noqa: F821
+@login_required
+def delete_pipeline_logs():
+    """Delete pipeline operation logs by ID.
+
+    Requires a ``kb_id`` query argument and reads ``log_ids`` (default ``[]``)
+    from the JSON body. Always answers ``data=True`` once the delete was issued.
+
+    NOTE(review): ``kb_id`` is only checked for presence; it is not used to
+    restrict which logs get deleted. Confirm that this is intended.
+    """
+    if not request.args.get("kb_id"):
+        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+    payload = request.get_json()
+    PipelineOperationLogService.delete_by_ids(payload.get("log_ids", []))
+
+    return get_json_result(data=True)
+
+
+@manager.route("/pipeline_log_detail", methods=["GET"])  # noqa: F821
+@login_required
+def pipeline_log_detail():
+    """Return one pipeline operation log, looked up by the ``log_id`` query argument."""
+    log_id = request.args.get("log_id")
+    if log_id:
+        found, record = PipelineOperationLogService.get_by_id(log_id)
+        if found:
+            return get_json_result(data=record.to_dict())
+        return get_data_error_result(message="Invalid pipeline log ID")
+
+    return get_json_result(data=False, message='Lack of "Pipeline log ID"', code=settings.RetCode.ARGUMENT_ERROR)
+
+
+@manager.route("/run_graphrag", methods=["POST"])  # noqa: F821
+@login_required
+def run_graphrag():
+    """Queue a GraphRAG task covering every document of a knowledge base.
+
+    Reads ``kb_id`` from the JSON body. Refuses to start when the task recorded
+    on the knowledge base has a progress other than -1 or 1. Otherwise queues a
+    new task and stores its ID in ``graphrag_task_id``.
+
+    Returns ``{"graphrag_task_id": <id>}`` or an error payload.
+    """
+    payload = request.json
+
+    kb_id = payload.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    previous_task_id = kb.graphrag_task_id
+    if previous_task_id:
+        task_found, task = TaskService.get_by_id(previous_task_id)
+        if not task_found:
+            # A dangling task ID is only logged; a new run may still start.
+            logging.warning(f"A valid GraphRAG task id is expected for kb {kb_id}")
+
+        if task and task.progress not in (-1, 1):
+            return get_error_data_result(message=f"Task {previous_task_id} in progress with status {task.progress}. A Graph Task is already running.")
+
+    # page_number=0 / items_per_page=0 with empty filters, as in the sibling run_* handlers.
+    query = dict(kb_id=kb_id, page_number=0, items_per_page=0, orderby="create_time", desc=False, keywords="", run_status=[], types=[], suffix=[])
+    documents, _ = DocumentService.get_by_kb_id(**query)
+    if not documents:
+        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
+
+    # The first document is passed as ``doc``; all IDs travel in ``doc_ids``.
+    new_task_id = queue_raptor_o_graphrag_tasks(
+        doc=documents[0],
+        ty="graphrag",
+        priority=0,
+        fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID,
+        doc_ids=[document["id"] for document in documents],
+    )
+
+    saved = KnowledgebaseService.update_by_id(kb.id, {"graphrag_task_id": new_task_id})
+    if not saved:
+        logging.warning(f"Cannot save graphrag_task_id for kb {kb_id}")
+
+    return get_json_result(data={"graphrag_task_id": new_task_id})
+
+
+@manager.route("/trace_graphrag", methods=["GET"])  # noqa: F821
+@login_required
+def trace_graphrag():
+    """Return the GraphRAG task recorded on a knowledge base.
+
+    Reads ``kb_id`` from the query string. Answers ``{}`` when the knowledge
+    base has no ``graphrag_task_id``, otherwise the task as a dict.
+    """
+    kb_id = request.args.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    recorded_task_id = kb.graphrag_task_id
+    if not recorded_task_id:
+        return get_json_result(data={})
+
+    task_found, task = TaskService.get_by_id(recorded_task_id)
+    if task_found:
+        return get_json_result(data=task.to_dict())
+
+    return get_error_data_result(message="GraphRAG Task Not Found or Error Occurred")
+
+
+@manager.route("/run_raptor", methods=["POST"])  # noqa: F821
+@login_required
+def run_raptor():
+    """Queue a RAPTOR task covering every document of a knowledge base.
+
+    Reads ``kb_id`` from the JSON body. Refuses to start when the task recorded
+    on the knowledge base has a progress other than -1 or 1. Otherwise queues a
+    new task and stores its ID in ``raptor_task_id``.
+
+    Returns ``{"raptor_task_id": <id>}`` or an error payload.
+    """
+    payload = request.json
+
+    kb_id = payload.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    previous_task_id = kb.raptor_task_id
+    if previous_task_id:
+        task_found, task = TaskService.get_by_id(previous_task_id)
+        if not task_found:
+            # A dangling task ID is only logged; a new run may still start.
+            logging.warning(f"A valid RAPTOR task id is expected for kb {kb_id}")
+
+        if task and task.progress not in (-1, 1):
+            return get_error_data_result(message=f"Task {previous_task_id} in progress with status {task.progress}. A RAPTOR Task is already running.")
+
+    # page_number=0 / items_per_page=0 with empty filters, as in the sibling run_* handlers.
+    query = dict(kb_id=kb_id, page_number=0, items_per_page=0, orderby="create_time", desc=False, keywords="", run_status=[], types=[], suffix=[])
+    documents, _ = DocumentService.get_by_kb_id(**query)
+    if not documents:
+        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
+
+    # The first document is passed as ``doc``; all IDs travel in ``doc_ids``.
+    new_task_id = queue_raptor_o_graphrag_tasks(
+        doc=documents[0],
+        ty="raptor",
+        priority=0,
+        fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID,
+        doc_ids=[document["id"] for document in documents],
+    )
+
+    saved = KnowledgebaseService.update_by_id(kb.id, {"raptor_task_id": new_task_id})
+    if not saved:
+        logging.warning(f"Cannot save raptor_task_id for kb {kb_id}")
+
+    return get_json_result(data={"raptor_task_id": new_task_id})
+
+
+@manager.route("/trace_raptor", methods=["GET"])  # noqa: F821
+@login_required
+def trace_raptor():
+    """Return the RAPTOR task recorded on a knowledge base.
+
+    Reads ``kb_id`` from the query string. Answers ``{}`` when the knowledge
+    base has no ``raptor_task_id``, otherwise the task as a dict.
+    """
+    kb_id = request.args.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    recorded_task_id = kb.raptor_task_id
+    if not recorded_task_id:
+        return get_json_result(data={})
+
+    task_found, task = TaskService.get_by_id(recorded_task_id)
+    if task_found:
+        return get_json_result(data=task.to_dict())
+
+    return get_error_data_result(message="RAPTOR Task Not Found or Error Occurred")
+
+
+@manager.route("/run_mindmap", methods=["POST"])  # noqa: F821
+@login_required
+def run_mindmap():
+    """Queue a Mindmap task covering every document of a knowledge base.
+
+    Reads ``kb_id`` from the JSON body. Refuses to start when the task recorded
+    on the knowledge base has a progress other than -1 or 1. Otherwise queues a
+    new task and stores its ID in ``mindmap_task_id``.
+
+    Returns ``{"mindmap_task_id": <id>}`` or an error payload.
+    """
+    payload = request.json
+
+    kb_id = payload.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    previous_task_id = kb.mindmap_task_id
+    if previous_task_id:
+        task_found, task = TaskService.get_by_id(previous_task_id)
+        if not task_found:
+            # A dangling task ID is only logged; a new run may still start.
+            logging.warning(f"A valid Mindmap task id is expected for kb {kb_id}")
+
+        if task and task.progress not in (-1, 1):
+            return get_error_data_result(message=f"Task {previous_task_id} in progress with status {task.progress}. A Mindmap Task is already running.")
+
+    # page_number=0 / items_per_page=0 with empty filters, as in the sibling run_* handlers.
+    query = dict(kb_id=kb_id, page_number=0, items_per_page=0, orderby="create_time", desc=False, keywords="", run_status=[], types=[], suffix=[])
+    documents, _ = DocumentService.get_by_kb_id(**query)
+    if not documents:
+        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
+
+    # The first document is passed as ``doc``; all IDs travel in ``doc_ids``.
+    new_task_id = queue_raptor_o_graphrag_tasks(
+        doc=documents[0],
+        ty="mindmap",
+        priority=0,
+        fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID,
+        doc_ids=[document["id"] for document in documents],
+    )
+
+    saved = KnowledgebaseService.update_by_id(kb.id, {"mindmap_task_id": new_task_id})
+    if not saved:
+        logging.warning(f"Cannot save mindmap_task_id for kb {kb_id}")
+
+    return get_json_result(data={"mindmap_task_id": new_task_id})
+
+
+@manager.route("/trace_mindmap", methods=["GET"])  # noqa: F821
+@login_required
+def trace_mindmap():
+    """Return the Mindmap task recorded on a knowledge base.
+
+    Reads ``kb_id`` from the query string. Answers ``{}`` when the knowledge
+    base has no ``mindmap_task_id``, otherwise the task as a dict.
+    """
+    kb_id = request.args.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_error_data_result(message="Invalid Knowledgebase ID")
+
+    recorded_task_id = kb.mindmap_task_id
+    if not recorded_task_id:
+        return get_json_result(data={})
+
+    task_found, task = TaskService.get_by_id(recorded_task_id)
+    if task_found:
+        return get_json_result(data=task.to_dict())
+
+    return get_error_data_result(message="Mindmap Task Not Found or Error Occurred")
+
+
+@manager.route("/unbind_task", methods=["DELETE"])  # noqa: F821
+@login_required
+def delete_kb_task():
+    """Detach a GraphRAG, RAPTOR or Mindmap task from a knowledge base.
+
+    Query string: ``kb_id`` and ``pipeline_task_type``. The matching
+    ``*_task_id`` column is reset to ``""`` and ``*_task_finish_at`` to None.
+    For GraphRAG the stored graph, subgraph, entity and relation entries are
+    deleted from the doc store first. An unknown knowledge base is answered
+    with ``data=True``.
+    """
+    kb_id = request.args.get("kb_id", "")
+    if not kb_id:
+        return get_error_data_result(message='Lack of "KB ID"')
+    kb_found, kb = KnowledgebaseService.get_by_id(kb_id)
+    if not kb_found:
+        return get_json_result(data=True)
+
+    pipeline_task_type = request.args.get("pipeline_task_type", "")
+    unbindable = [PipelineTaskType.GRAPH_RAG, PipelineTaskType.RAPTOR, PipelineTaskType.MINDMAP]
+    if not (pipeline_task_type and pipeline_task_type in unbindable):
+        return get_error_data_result(message="Invalid task type")
+
+    if pipeline_task_type == PipelineTaskType.GRAPH_RAG:
+        settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph", "entity", "relation"]}, search.index_name(kb.tenant_id), kb_id)
+        kb_task_id, kb_task_finish_at = "graphrag_task_id", "graphrag_task_finish_at"
+    elif pipeline_task_type == PipelineTaskType.RAPTOR:
+        kb_task_id, kb_task_finish_at = "raptor_task_id", "raptor_task_finish_at"
+    elif pipeline_task_type == PipelineTaskType.MINDMAP:
+        kb_task_id, kb_task_finish_at = "mindmap_task_id", "mindmap_task_finish_at"
+    else:
+        # Kept as a safety net; the membership check above already filters this out.
+        return get_error_data_result(message="Internal Error: Invalid task type")
+
+    if not KnowledgebaseService.update_by_id(kb_id, {kb_task_id: "", kb_task_finish_at: None}):
+        # NOTE(review): other call sites in this file pass an exception object
+        # to server_error_response; confirm that a plain string is accepted.
+        return server_error_response(f"Internal error: cannot delete task {pipeline_task_type}")
+
+    return get_json_result(data=True)
--- a/api/db/init.py
+++ b/api/db/init.py
@ -127,4 +127,15 @@ class MCPServerType(StrEnum):
 VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}


+class PipelineTaskType(StrEnum):
+    """Task kinds that a pipeline operation can be tagged with."""
+
+    PARSE = "Parse"
+    DOWNLOAD = "Download"
+    RAPTOR = "RAPTOR"
+    GRAPH_RAG = "GraphRAG"
+    MINDMAP = "Mindmap"
+
+
+# Every member is valid, so the set is built from the enum itself.
+VALID_PIPELINE_TASK_TYPES = set(PipelineTaskType)
+
+
 KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@ -684,8 +684,17 @@ class Knowledgebase(DataBaseModel):
    vector_similarity_weight = FloatField(default=0.3, index=True)

    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
+    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    pagerank = IntegerField(default=0, index=False)
+
+    graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
+    graphrag_task_finish_at = DateTimeField(null=True)
+    raptor_task_id = CharField(max_length=32, null=True, help_text="RAPTOR task ID", index=True)
+    raptor_task_finish_at = DateTimeField(null=True)
+    mindmap_task_id = CharField(max_length=32, null=True, help_text="Mindmap task ID", index=True)
+    mindmap_task_finish_at = DateTimeField(null=True)
+
    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)

    def __str__(self):
@ -700,6 +709,7 @@ class Document(DataBaseModel):
    thumbnail = TextField(null=True, help_text="thumbnail base64 string")
    kb_id = CharField(max_length=256, null=False, index=True)
    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
+    pipeline_id = CharField(max_length=32, null=True, help_text="pipleline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
    type = CharField(max_length=32, null=False, help_text="file extension", index=True)
@ -942,6 +952,32 @@ class Search(DataBaseModel):
        db_table = "search"


+class PipelineOperationLog(DataBaseModel):
+    """One row per logged pipeline operation, stored in ``pipeline_operation_log``."""
+
+    # Identity and ownership.
+    id = CharField(max_length=32, primary_key=True)
+    document_id = CharField(max_length=32, index=True)
+    tenant_id = CharField(max_length=32, null=False, index=True)
+    kb_id = CharField(max_length=32, null=False, index=True)
+    # Pipeline / parser the operation was run with; the pipeline fields are nullable.
+    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
+    pipeline_title = CharField(max_length=32, null=True, help_text="Pipeline title", index=True)
+    parser_id = CharField(max_length=32, null=False, help_text="Parser ID", index=True)
+    # Descriptive fields about the document the operation refers to.
+    document_name = CharField(max_length=255, null=False, help_text="File name")
+    document_suffix = CharField(max_length=255, null=False, help_text="File suffix")
+    document_type = CharField(max_length=255, null=False, help_text="Document type")
+    source_from = CharField(max_length=255, null=False, help_text="Source")
+    # Progress and timing of the operation.
+    progress = FloatField(default=0, index=True)
+    progress_msg = TextField(null=True, help_text="process message", default="")
+    process_begin_at = DateTimeField(null=True, index=True)
+    process_duration = FloatField(default=0)
+    # JSON payload; presumably the pipeline DSL used for the run — TODO confirm.
+    dsl = JSONField(null=True, default=dict)
+    task_type = CharField(max_length=32, null=False, default="")
+    operation_status = CharField(max_length=32, null=False, help_text="Operation status")
+    avatar = TextField(null=True, help_text="avatar base64 string")
+    # Validity flag ("0" wasted, "1" valid), defaulting to "1".
+    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)
+
+    class Meta:
+        db_table = "pipeline_operation_log"
+
+
 def migrate_db():
    logging.disable(logging.ERROR)
    migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
@ -1058,7 +1094,6 @@ def migrate_db():
        migrate(migrator.add_column("dialog", "meta_data_filter", JSONField(null=True, default={})))
    except Exception:
        pass
-
    try:
        migrate(migrator.alter_column_type("canvas_template", "title", JSONField(null=True, default=dict, help_text="Canvas title")))
    except Exception:
@ -1075,4 +1110,36 @@ def migrate_db():
        migrate(migrator.add_column("canvas_template", "canvas_category", CharField(max_length=32, null=False, default="agent_canvas", help_text="agent_canvas|dataflow_canvas", index=True)))
    except Exception:
        pass
+    # Columns added for pipeline support. Each add_column is attempted on its own
+    # and failures are ignored, following the pattern used by the migrations above.
+    try:
+        migrate(migrator.add_column("knowledgebase", "pipeline_id", CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("document", "pipeline_id", CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "graphrag_task_id", CharField(max_length=32, null=True, help_text="Gragh RAG task ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "raptor_task_id", CharField(max_length=32, null=True, help_text="RAPTOR task ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "graphrag_task_finish_at", DateTimeField(null=True)))
+    except Exception:
+        pass
+    # The *_task_finish_at columns are DateTimeField on the Knowledgebase model,
+    # so the migration must create them with the same type.
+    try:
+        migrate(migrator.add_column("knowledgebase", "raptor_task_finish_at", DateTimeField(null=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "mindmap_task_id", CharField(max_length=32, null=True, help_text="Mindmap task ID", index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("knowledgebase", "mindmap_task_finish_at", DateTimeField(null=True)))
+    except Exception:
+        pass
    logging.disable(logging.NOTSET)
--- a/api/db/services/canvas_service.py
+++ b/api/db/services/canvas_service.py
@ -126,7 +126,7 @@ class UserCanvasService(CommonService):
    @DB.connection_context()
    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                          page_number, items_per_page,
-                          orderby, desc, keywords, canvas_category=CanvasCategory.Agent,
+                          orderby, desc, keywords, canvas_category=None
                          ):
        fields = [
            cls.model.id,
@ -135,6 +135,7 @@ class UserCanvasService(CommonService):
            cls.model.dsl,
            cls.model.description,
            cls.model.permission,
+            cls.model.user_id.alias("tenant_id"),
            User.nickname,
            User.avatar.alias('tenant_avatar'),
            cls.model.update_time,
@ -142,24 +143,26 @@ class UserCanvasService(CommonService):
        ]
        if keywords:
            agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
-                ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
-                                                                TenantPermission.TEAM.value)) | (
-                    cls.model.user_id == user_id)),
-                (fn.LOWER(cls.model.title).contains(keywords.lower()))
+                cls.model.user_id.in_(joined_tenant_ids),
+                fn.LOWER(cls.model.title).contains(keywords.lower())
+                #(((cls.model.user_id.in_(joined_tenant_ids)) & (cls.model.permission == TenantPermission.TEAM.value)) | (cls.model.user_id == user_id)),
+                #(fn.LOWER(cls.model.title).contains(keywords.lower()))
            )
        else:
            agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
-                ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
-                                                                TenantPermission.TEAM.value)) | (
-                    cls.model.user_id == user_id))
+                cls.model.user_id.in_(joined_tenant_ids)
+                #(((cls.model.user_id.in_(joined_tenant_ids)) & (cls.model.permission == TenantPermission.TEAM.value)) | (cls.model.user_id == user_id))
            )
-        agents = agents.where(cls.model.canvas_category == canvas_category)
+        if canvas_category:
+            agents = agents.where(cls.model.canvas_category == canvas_category)
        if desc:
            agents = agents.order_by(cls.model.getter_by(orderby).desc())
        else:
            agents = agents.order_by(cls.model.getter_by(orderby).asc())
+
        count = agents.count()
-        agents = agents.paginate(page_number, items_per_page)
+        if page_number and items_per_page:
+            agents = agents.paginate(page_number, items_per_page)
        return list(agents.dicts()), count

    @classmethod
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -24,12 +24,13 @@ from io import BytesIO

 import trio
 import xxhash
-from peewee import fn, Case
+from peewee import fn, Case, JOIN

 from api import settings
 from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
-from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole
-from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File
+from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole, CanvasCategory
+from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, \
+    User
 from api.db.db_utils import bulk_insert_into_db
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@ -51,6 +52,7 @@ class DocumentService(CommonService):
            cls.model.thumbnail,
            cls.model.kb_id,
            cls.model.parser_id,
+            cls.model.pipeline_id,
            cls.model.parser_config,
            cls.model.source_type,
            cls.model.type,
@ -79,7 +81,10 @@ class DocumentService(CommonService):
    def get_list(cls, kb_id, page_number, items_per_page,
                 orderby, desc, keywords, id, name):
        fields = cls.get_cls_model_fields()
-        docs = cls.model.select(*fields).join(File2Document, on = (File2Document.document_id == cls.model.id)).join(File, on = (File.id == File2Document.file_id)).where(cls.model.kb_id == kb_id)
+        docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
+            .join(File, on = (File.id == File2Document.file_id))\
+            .join(UserCanvas, on = ((cls.model.pipeline_id == UserCanvas.id) & (UserCanvas.canvas_category == CanvasCategory.DataFlow.value)), join_type=JOIN.LEFT_OUTER)\
+            .where(cls.model.kb_id == kb_id)
        if id:
            docs = docs.where(
                cls.model.id == id)
@ -117,12 +122,22 @@ class DocumentService(CommonService):
                     orderby, desc, keywords, run_status, types, suffix):
        fields = cls.get_cls_model_fields()
        if keywords:
-            docs = cls.model.select(*fields).join(File2Document, on=(File2Document.document_id == cls.model.id)).join(File, on=(File.id == File2Document.file_id)).where(
-                (cls.model.kb_id == kb_id),
-                (fn.LOWER(cls.model.name).contains(keywords.lower()))
-            )
+            docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
+                .join(File2Document, on=(File2Document.document_id == cls.model.id))\
+                .join(File, on=(File.id == File2Document.file_id))\
+                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
+                .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
+                .where(
+                    (cls.model.kb_id == kb_id),
+                    (fn.LOWER(cls.model.name).contains(keywords.lower()))
+                )
        else:
-            docs = cls.model.select(*fields).join(File2Document, on=(File2Document.document_id == cls.model.id)).join(File, on=(File.id == File2Document.file_id)).where(cls.model.kb_id == kb_id)
+            docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
+                .join(File2Document, on=(File2Document.document_id == cls.model.id))\
+                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
+                .join(File, on=(File.id == File2Document.file_id))\
+                .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
+                .where(cls.model.kb_id == kb_id)

        if run_status:
            docs = docs.where(cls.model.run.in_(run_status))
@ -370,8 +385,7 @@ class DocumentService(CommonService):
                               process_duration=cls.model.process_duration + duration).where(
            cls.model.id == doc_id).execute()
        if num == 0:
-            raise LookupError(
-                "Document not found which is supposed to be there")
+            logging.warning("Document not found which is supposed to be there")
        num = Knowledgebase.update(
            token_num=Knowledgebase.token_num +
                      token_num,
@ -637,6 +651,22 @@ class DocumentService(CommonService):
    @DB.connection_context()
    def update_progress(cls):
        docs = cls.get_unfinished_docs()
+
+        cls._sync_progress(docs)
+
+
+    @classmethod
+    @DB.connection_context()
+    def update_progress_immediately(cls, docs:list[dict]):
+        """Sync progress for the given documents right away; do nothing for an empty list."""
+        if docs:
+            cls._sync_progress(docs)
+
+
+    @classmethod
+    @DB.connection_context()
+    def _sync_progress(cls, docs:list[dict]):
        for d in docs:
            try:
                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
@ -646,8 +676,6 @@ class DocumentService(CommonService):
                prg = 0
                finished = True
                bad = 0
-                has_raptor = False
-                has_graphrag = False
                e, doc = DocumentService.get_by_id(d["id"])
                status = doc.run  # TaskStatus.RUNNING.value
                priority = 0
@ -659,24 +687,14 @@ class DocumentService(CommonService):
                    prg += t.progress if t.progress >= 0 else 0
                    if t.progress_msg.strip():
                        msg.append(t.progress_msg)
-                    if t.task_type == "raptor":
-                        has_raptor = True
-                    elif t.task_type == "graphrag":
-                        has_graphrag = True
                    priority = max(priority, t.priority)
                prg /= len(tsks)
                if finished and bad:
                    prg = -1
                    status = TaskStatus.FAIL.value
                elif finished:
-                    if (d["parser_config"].get("raptor") or {}).get("use_raptor") and not has_raptor:
-                        queue_raptor_o_graphrag_tasks(d, "raptor", priority)
-                        prg = 0.98 * len(tsks) / (len(tsks) + 1)
-                    elif (d["parser_config"].get("graphrag") or {}).get("use_graphrag") and not has_graphrag:
-                        queue_raptor_o_graphrag_tasks(d, "graphrag", priority)
-                        prg = 0.98 * len(tsks) / (len(tsks) + 1)
-                    else:
-                        status = TaskStatus.DONE.value
+                    prg = 1
+                    status = TaskStatus.DONE.value

                msg = "\n".join(sorted(msg))
                info = {
@ -688,7 +706,7 @@ class DocumentService(CommonService):
                    info["progress"] = prg
                if msg:
                    info["progress_msg"] = msg
-                    if msg.endswith("created task graphrag") or msg.endswith("created task raptor"):
+                    if msg.endswith("created task graphrag") or msg.endswith("created task raptor") or msg.endswith("created task mindmap"):
                        info["progress_msg"] += "\n%d tasks are ahead in the queue..."%get_queue_length(priority)
                else:
                    info["progress_msg"] = "%d tasks are ahead in the queue..."%get_queue_length(priority)
@ -769,7 +787,11 @@ class DocumentService(CommonService):
            "cancelled": int(cancelled),
        }

-def queue_raptor_o_graphrag_tasks(doc, ty, priority):
+def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[]):
+    """
+    You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
+    Optionally, specify a list of doc_ids to determine which documents participate in the task.
+    """
    chunking_config = DocumentService.get_chunking_config(doc["id"])
    hasher = xxhash.xxh64()
    for field in sorted(chunking_config.keys()):
@ -779,11 +801,12 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority):
        nonlocal doc
        return {
            "id": get_uuid(),
-            "doc_id": doc["id"],
+            "doc_id": fake_doc_id if fake_doc_id else doc["id"],
            "from_page": 100000000,
            "to_page": 100000000,
            "task_type": ty,
-            "progress_msg":  datetime.now().strftime("%H:%M:%S") + " created task " + ty
+            "progress_msg":  datetime.now().strftime("%H:%M:%S") + " created task " + ty,
+            "begin_at": datetime.now(),
        }

    task = new_task()
@ -792,7 +815,12 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority):
    hasher.update(ty.encode("utf-8"))
    task["digest"] = hasher.hexdigest()
    bulk_insert_into_db(Task, [task], True)
+
+    if ty in ["graphrag", "raptor", "mindmap"]:
+        task["doc_ids"] = doc_ids
+        DocumentService.begin2parse(doc["id"])
    assert REDIS_CONN.queue_product(get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
+    return task["id"]


 def get_queue_length(priority):
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@ -457,6 +457,7 @@ class FileService(CommonService):
                    "id": doc_id,
                    "kb_id": kb.id,
                    "parser_id": self.get_parser(filetype, filename, kb.parser_id),
+                    "pipeline_id": kb.pipeline_id,
                    "parser_config": kb.parser_config,
                    "created_by": user_id,
                    "type": filetype,
@ -512,7 +513,7 @@ class FileService(CommonService):
            return ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            return ParserType.PRESENTATION.value
-        if re.search(r"\.(eml)$", filename):
+        if re.search(r"\.(msg|eml)$", filename):
            return ParserType.EMAIL.value
        return default

--- a/api/db/services/knowledgebase_service.py
+++ b/api/db/services/knowledgebase_service.py
@ -15,10 +15,10 @@
 #
 from datetime import datetime

-from peewee import fn
+from peewee import fn, JOIN

 from api.db import StatusEnum, TenantPermission
-from api.db.db_models import DB, Document, Knowledgebase, Tenant, User, UserTenant
+from api.db.db_models import DB, Document, Knowledgebase, User, UserTenant, UserCanvas
 from api.db.services.common_service import CommonService
 from api.utils import current_timestamp, datetime_format

@ -260,20 +260,29 @@ class KnowledgebaseService(CommonService):
            cls.model.token_num,
            cls.model.chunk_num,
            cls.model.parser_id,
+            cls.model.pipeline_id,
+            UserCanvas.title.alias("pipeline_name"),
+            UserCanvas.avatar.alias("pipeline_avatar"),
            cls.model.parser_config,
            cls.model.pagerank,
+            cls.model.graphrag_task_id,
+            cls.model.graphrag_task_finish_at,
+            cls.model.raptor_task_id,
+            cls.model.raptor_task_finish_at,
+            cls.model.mindmap_task_id,
+            cls.model.mindmap_task_finish_at,
            cls.model.create_time,
            cls.model.update_time
            ]
-        kbs = cls.model.select(*fields).join(Tenant, on=(
-            (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
+        kbs = cls.model.select(*fields)\
+                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
+            .where(
            (cls.model.id == kb_id),
            (cls.model.status == StatusEnum.VALID.value)
-        )
+        ).dicts()
        if not kbs:
            return
-        d = kbs[0].to_dict()
-        return d
+        return kbs[0]

    @classmethod
    @DB.connection_context()
--- a/api/db/services/pipeline_operation_log_service.py
+++ b/api/db/services/pipeline_operation_log_service.py
@ -0,0 +1,263 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import json
+import logging
+import os
+from datetime import datetime, timedelta
+
+from peewee import fn
+
+from api.db import VALID_PIPELINE_TASK_TYPES, PipelineTaskType
+from api.db.db_models import DB, Document, PipelineOperationLog
+from api.db.services.canvas_service import UserCanvasService
+from api.db.services.common_service import CommonService
+from api.db.services.document_service import DocumentService
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID
+from api.utils import current_timestamp, datetime_format, get_uuid
+
+
+class PipelineOperationLogService(CommonService):
+    model = PipelineOperationLog
+
+    @classmethod
+    def get_file_logs_fields(cls):
+        """Return the model columns that per-document (file) log queries select.
+
+        The columns are listed in a fixed order; the list is rebuilt on every call.
+        """
+        log_model = cls.model
+        column_names = (
+            "id",
+            "document_id",
+            "tenant_id",
+            "kb_id",
+            "pipeline_id", "pipeline_title",
+            "parser_id",
+            "document_name",
+            "document_suffix", "document_type",
+            "source_from",
+            "progress", "progress_msg",
+            "process_begin_at", "process_duration",
+            "dsl",
+            "task_type",
+            "operation_status",
+            "avatar",
+            "status",
+            "create_time", "create_date",
+            "update_time", "update_date",
+        )
+        return [getattr(log_model, column) for column in column_names]
+
+    @classmethod
+    def get_dataset_logs_fields(cls):
+        """Return the model columns that dataset-level log queries select."""
+        log_model = cls.model
+        column_names = (
+            "id",
+            "tenant_id",
+            "kb_id",
+            "progress",
+            "progress_msg",
+            "process_begin_at", "process_duration",
+            "task_type",
+            "operation_status",
+            "avatar",
+            "status",
+            "create_time", "create_date",
+            "update_time", "update_date",
+        )
+        return [getattr(log_model, column) for column in column_names]
+
+    @classmethod
+    def save(cls, **kwargs):
+        """Force-insert one row built from ``kwargs`` and return the result of the model's ``save``.
+
+        Callers are expected to wrap this call in a transaction.
+        """
+        return cls.model(**kwargs).save(force_insert=True)
+
+    @classmethod
+    @DB.connection_context()
+    def create(cls, document_id, pipeline_id, task_type, fake_document_ids=None, dsl: str = "{}"):
+        """Write one operation log row once the referred document has finished processing.
+
+        Args:
+            document_id: A real document id, or GRAPH_RAPTOR_FAKE_DOC_ID for a dataset-level task.
+            pipeline_id: Canvas pipeline id; when falsy, the knowledge base and parser are used.
+            task_type: One of VALID_PIPELINE_TASK_TYPES.
+            fake_document_ids: Real ids behind the fake id; the first supplies the metadata.
+            dsl: JSON text; stored parsed in the log row.
+
+        Returns the result of ``cls.save``, or None if the document is missing or its progress is not 1/-1.
+        Raises RuntimeError for a missing pipeline/knowledge base, ValueError for a bad task_type.
+        """
+        referred_document_id = document_id
+        if document_id == GRAPH_RAPTOR_FAKE_DOC_ID and fake_document_ids:
+            referred_document_id = fake_document_ids[0]
+        ok, document = DocumentService.get_by_id(referred_document_id)
+        if not ok:
+            logging.warning(f"Document for referred_document_id {referred_document_id} not found")
+            return
+        # Let DocumentService update the progress, then re-read the row before checking it.
+        DocumentService.update_progress_immediately([document.to_dict()])
+        ok, document = DocumentService.get_by_id(referred_document_id)
+        if not ok:
+            logging.warning(f"Document for referred_document_id {referred_document_id} not found")
+            return
+        if document.progress not in [1, -1]:
+            return
+
+        if pipeline_id:
+            ok, user_pipeline = UserCanvasService.get_by_id(pipeline_id)
+            if not ok:
+                raise RuntimeError(f"Pipeline {pipeline_id} not found")
+            tenant_id, title, avatar = user_pipeline.user_id, user_pipeline.title, user_pipeline.avatar
+        else:
+            ok, kb_info = KnowledgebaseService.get_by_id(document.kb_id)
+            if not ok:
+                raise RuntimeError(f"Cannot find knowledge base {document.kb_id} for referred_document {referred_document_id}")
+            tenant_id, title, avatar = kb_info.tenant_id, document.parser_id, document.thumbnail
+
+        if task_type not in VALID_PIPELINE_TASK_TYPES:
+            raise ValueError(f"Invalid task type: {task_type}")
+
+        # Dataset-level tasks also stamp their finish time on the knowledge base.
+        finish_columns = (
+            (PipelineTaskType.GRAPH_RAG, "graphrag_task_finish_at"),
+            (PipelineTaskType.RAPTOR, "raptor_task_finish_at"),
+            (PipelineTaskType.MINDMAP, "mindmap_task_finish_at"),
+        )
+        for dataset_task_type, finish_column in finish_columns:
+            if task_type == dataset_task_type:
+                finish_at = document.process_begin_at + timedelta(seconds=document.process_duration)
+                KnowledgebaseService.update_by_id(document.kb_id, {finish_column: finish_at})
+                break
+
+        created_time = current_timestamp()
+        created_date = datetime_format(datetime.now())
+        log = dict(
+            id=get_uuid(),
+            document_id=document_id,  # GRAPH_RAPTOR_FAKE_DOC_ID or real document_id
+            tenant_id=tenant_id,
+            kb_id=document.kb_id,
+            pipeline_id=pipeline_id,
+            pipeline_title=title,
+            parser_id=document.parser_id,
+            document_name=document.name,
+            document_suffix=document.suffix,
+            document_type=document.type,
+            source_from="",  # TODO: add in the future
+            progress=document.progress,
+            progress_msg=document.progress_msg,
+            process_begin_at=document.process_begin_at,
+            process_duration=document.process_duration,
+            dsl=json.loads(dsl),
+            task_type=task_type,
+            operation_status=document.run,
+            avatar=avatar,
+            create_time=created_time,
+            create_date=created_date,
+            update_time=created_time,
+            update_date=created_date,
+        )
+
+        with DB.atomic():
+            obj = cls.save(**log)
+            # Keep only the newest `limit` rows of this knowledge base.
+            limit = int(os.getenv("PIPELINE_OPERATION_LOG_LIMIT", 1000))
+            total = cls.model.select().where(cls.model.kb_id == document.kb_id).count()
+            if total > limit:
+                keep_ids = [m.id for m in cls.model.select(cls.model.id).where(cls.model.kb_id == document.kb_id).order_by(cls.model.create_time.desc()).limit(limit)]
+                deleted = cls.model.delete().where(cls.model.kb_id == document.kb_id, cls.model.id.not_in(keep_ids)).execute()
+                logging.info(f"[PipelineOperationLogService] Cleaned {deleted} old logs, kept latest {limit} for {document.kb_id}")
+        return obj
+
+    @classmethod
+    @DB.connection_context()
+    def record_pipeline_operation(cls, document_id, pipeline_id, task_type, fake_document_ids=None):
+        """Forward to ``create`` with the default (empty) DSL; returns what ``create`` returns."""
+        return cls.create(document_id=document_id, pipeline_id=pipeline_id, task_type=task_type, fake_document_ids=fake_document_ids)
+
+    @classmethod
+    @DB.connection_context()
+    def get_file_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix, create_date_from=None, create_date_to=None):
+        """Return one page of per-document logs of a knowledge base plus the total match count.
+
+        Rows logged under GRAPH_RAPTOR_FAKE_DOC_ID are excluded. ``keywords`` is matched
+        case-insensitively against the document name; every other falsy filter is skipped.
+        Pagination is applied only when both ``page_number`` and ``items_per_page`` are truthy.
+        """
+        log_model = cls.model
+        conditions = [log_model.kb_id == kb_id]
+        if keywords:
+            conditions.append(fn.LOWER(log_model.document_name).contains(keywords.lower()))
+        query = log_model.select(*cls.get_file_logs_fields()).where(*conditions)
+        query = query.where(log_model.document_id != GRAPH_RAPTOR_FAKE_DOC_ID)
+        if operation_status:
+            query = query.where(log_model.operation_status.in_(operation_status))
+        if types:
+            query = query.where(log_model.document_type.in_(types))
+        if suffix:
+            query = query.where(log_model.document_suffix.in_(suffix))
+        if create_date_from:
+            query = query.where(log_model.create_date >= create_date_from)
+        if create_date_to:
+            query = query.where(log_model.create_date <= create_date_to)
+
+        count = query.count()
+        sort_column = log_model.getter_by(orderby)
+        query = query.order_by(sort_column.desc() if desc else sort_column.asc())
+        if page_number and items_per_page:
+            query = query.paginate(page_number, items_per_page)
+        return list(query.dicts()), count
+
+    @classmethod
+    @DB.connection_context()
+    def get_documents_info(cls, id):
+        """Look up the document referenced by the log row ``id``.
+
+        Returns a peewee query yielding dicts with the document's id, name, progress and kb_id.
+        """
+        document_columns = [Document.id, Document.name, Document.progress, Document.kb_id]
+        log_model = cls.model
+        # Default (inner) join: a log row without a matching document yields no result.
+        joined = log_model.select(*document_columns).join(Document, on=(log_model.document_id == Document.id))
+        return joined.where(log_model.id == id).dicts()
+
+    @classmethod
+    @DB.connection_context()
+    def get_dataset_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, operation_status, create_date_from=None, create_date_to=None):
+        """Return one page of dataset-level logs of a knowledge base plus the total match count.
+
+        Only rows logged under GRAPH_RAPTOR_FAKE_DOC_ID are considered; falsy filters are skipped.
+        """
+        log_model = cls.model
+        query = log_model.select(*cls.get_dataset_logs_fields()).where(
+            (log_model.kb_id == kb_id), (log_model.document_id == GRAPH_RAPTOR_FAKE_DOC_ID)
+        )
+        if operation_status:
+            query = query.where(log_model.operation_status.in_(operation_status))
+        if create_date_from:
+            query = query.where(log_model.create_date >= create_date_from)
+        if create_date_to:
+            query = query.where(log_model.create_date <= create_date_to)
+        count = query.count()
+        sort_column = log_model.getter_by(orderby)
+        query = query.order_by(sort_column.desc() if desc else sort_column.asc())
+        if page_number and items_per_page:
+            query = query.paginate(page_number, items_per_page)
+        return list(query.dicts()), count
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@ -35,6 +35,8 @@ from rag.utils.redis_conn import REDIS_CONN
 from api import settings
 from rag.nlp import search

+CANVAS_DEBUG_DOC_ID = "dataflow_x"
+GRAPH_RAPTOR_FAKE_DOC_ID = "graph_raptor_x"

 def trim_header_by_lines(text: str, max_length) -> str:
    # Trim header text to maximum length while preserving line breaks
@ -70,7 +72,7 @@ class TaskService(CommonService):

    @classmethod
    @DB.connection_context()
-    def get_task(cls, task_id):
+    def get_task(cls, task_id, doc_ids=[]):
        """Retrieve detailed task information by task ID.

        This method fetches comprehensive task details including associated document,
@ -84,6 +86,10 @@ class TaskService(CommonService):
            dict: Task details dictionary containing all task information and related metadata.
                 Returns None if task is not found or has exceeded retry limit.
        """
+        doc_id = cls.model.doc_id
+        if doc_id == CANVAS_DEBUG_DOC_ID and doc_ids:
+            doc_id = doc_ids[0]
+
        fields = [
            cls.model.id,
            cls.model.doc_id,
@ -109,7 +115,7 @@ class TaskService(CommonService):
        ]
        docs = (
            cls.model.select(*fields)
-                .join(Document, on=(cls.model.doc_id == Document.id))
+                .join(Document, on=(doc_id == Document.id))
                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
                .where(cls.model.id == task_id)
@ -292,21 +298,23 @@ class TaskService(CommonService):
                        ((prog == -1) | (prog > cls.model.progress))
                    )
                ).execute()
-            return
+        else:
+            with DB.lock("update_progress", -1):
+                if info["progress_msg"]:
+                    progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
+                    cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
+                if "progress" in info:
+                    prog = info["progress"]
+                    cls.model.update(progress=prog).where(
+                        (cls.model.id == id) &
+                        (
+                            (cls.model.progress != -1) &
+                            ((prog == -1) | (prog > cls.model.progress))
+                        )
+                    ).execute()

-        with DB.lock("update_progress", -1):
-            if info["progress_msg"]:
-                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
-                cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
-            if "progress" in info:
-                prog = info["progress"]
-                cls.model.update(progress=prog).where(
-                    (cls.model.id == id) &
-                    (
-                        (cls.model.progress != -1) &
-                        ((prog == -1) | (prog > cls.model.progress))
-                    )
-                ).execute()
+        process_duration = (datetime.now() - task.begin_at).total_seconds()
+        cls.model.update(process_duration=process_duration).where(cls.model.id == id).execute()

    @classmethod
    @DB.connection_context()
@ -336,7 +344,14 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        - Previous task chunks may be reused if available
    """
    def new_task():
-        return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0, "from_page": 0, "to_page": 100000000}
+        return {
+            "id": get_uuid(),
+            "doc_id": doc["id"],
+            "progress": 0.0,
+            "from_page": 0,
+            "to_page": 100000000,
+            "begin_at": datetime.now(),
+        }

    parse_task_array = []

@ -349,7 +364,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        page_size = doc["parser_config"].get("task_page_size") or 12
        if doc["parser_id"] == "paper":
            page_size = doc["parser_config"].get("task_page_size") or 22
-        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC":
+        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
            page_size = 10 ** 9
        page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
        for s, e in page_ranges:
@ -478,33 +493,26 @@ def has_canceled(task_id):
    return False


-def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
-    """
-    Returns a tuple (success: bool, error_message: str).
-    """
-    _ = callback
+def queue_dataflow(tenant_id:str, flow_id:str, task_id:str, doc_id:str=CANVAS_DEBUG_DOC_ID, file:dict=None, priority: int=0, rerun:bool=False) -> tuple[bool, str]:

    task = dict(
-    id=get_uuid() if not task_id else task_id,
-    doc_id=doc_id,
-    from_page=0,
-    to_page=100000000,
-    task_type="dataflow",
-    priority=priority,
+        id=task_id,
+        doc_id=doc_id,
+        from_page=0,
+        to_page=100000000,
+        task_type="dataflow" if not rerun else "dataflow_rerun",
+        priority=priority,
+        begin_at=datetime.now(),
    )
-
-    TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
+    if doc_id not in [CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID]:
+        TaskService.model.delete().where(TaskService.model.doc_id == doc_id).execute()
+        DocumentService.begin2parse(doc_id)
    bulk_insert_into_db(model=Task, data_source=[task], replace_on_conflict=True)

-    kb_id = DocumentService.get_knowledgebase_id(doc_id)
-    if not kb_id:
-        return False, f"Can't find KB of this document: {doc_id}"
-
-    task["kb_id"] = kb_id
+    task["kb_id"] = DocumentService.get_knowledgebase_id(doc_id)
    task["tenant_id"] = tenant_id
-    task["task_type"] = "dataflow"
-    task["dsl"] = dsl
-    task["dataflow_id"] = get_uuid() if not flow_id else flow_id
+    task["dataflow_id"] = flow_id
+    task["file"] = file

    if not REDIS_CONN.queue_product(
        get_svr_queue_name(priority), message=task
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@ -705,7 +705,9 @@ TimeoutException = Union[Type[BaseException], BaseException]
 OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]


-def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
+def timeout(seconds: float | int | str = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
+    if isinstance(seconds, str):
+        seconds = float(seconds)
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
--- a/api/utils/base64_image.py
+++ b/api/utils/base64_image.py
@ -1,3 +1,56 @@
 import base64
+import logging
+from functools import partial
+from io import BytesIO
+
+from PIL import Image
+
 test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAIAAAD/gAIDAAAA6ElEQVR4nO3QwQ3AIBDAsIP9d25XIC+EZE8QZc18w5l9O+AlZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBT+IYAHHLHkdEgAAAABJRU5ErkJggg=="
-test_image = base64.b64decode(test_image_base64)
+test_image = base64.b64decode(test_image_base64)
+
+
+async def image2id(d: dict, storage_put_func: partial, objname:str, bucket:str="imagetemps"):
+    """Upload ``d["image"]`` to storage and replace it with ``d["img_id"]``; no-op without an image.
+
+    Bytes are uploaded as-is; other image objects are saved as JPEG (RGBA/P converted to RGB).
+    """
+    import trio
+    from rag.svr.task_executor import minio_limiter
+    image = d.get("image")
+    if not image:
+        return
+    with BytesIO() as output_buffer:
+        if isinstance(image, bytes):
+            output_buffer.write(image)
+            output_buffer.seek(0)
+        else:
+            # JPEG cannot store an alpha channel or a palette, so convert such images to RGB first.
+            if image.mode in ("RGBA", "P"):
+                image = d["image"] = image.convert("RGB")
+            try:
+                image.save(output_buffer, format='JPEG')
+            except OSError as e:
+                logging.warning("Saving image exception, ignore: {}".format(str(e)))
+
+        async with minio_limiter:
+            await trio.to_thread.run_sync(lambda: storage_put_func(bucket=bucket, fnm=objname, binary=output_buffer.getvalue()))
+        d["img_id"] = f"{bucket}-{objname}"
+        if not isinstance(image, bytes):
+            image.close()
+        del d["image"]  # Remove image reference
+
+
+def id2image(image_id:str|None, storage_get_func: partial):
+    """Load the image stored under a ``"<bucket>-<name>"`` id, or return None.
+
+    None is returned for an empty or malformed id, an empty blob, or any error (which is logged).
+    """
+    parts = image_id.split("-") if image_id else []
+    if len(parts) != 2:
+        return None
+    bucket_name, object_name = parts
+    try:
+        blob = storage_get_func(bucket=bucket_name, filename=object_name)
+        return Image.open(BytesIO(blob)) if blob else None
+    except Exception as e:
+        logging.exception(e)
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@ -155,7 +155,7 @@ def filename_type(filename):
    if re.match(r".*\.pdf$", filename):
        return FileType.PDF.value

-    if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
+    if re.match(r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
        return FileType.DOC.value

    if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
--- a/api/utils/health.py
+++ b/api/utils/health.py
@ -0,0 +1,104 @@
+from timeit import default_timer as timer
+
+from api import settings
+from api.db.db_models import DB
+from rag.utils.redis_conn import REDIS_CONN
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+def _ok_nok(ok: bool) -> str:
+    return "nok" if not ok else "ok"  # render a probe result as the "ok"/"nok" flag
+
+
+def check_db() -> tuple[bool, dict]:
+    """Probe the database with ``SELECT 1``; return ``(ok, meta)`` with the elapsed milliseconds."""
+    started = timer()
+    try:
+        DB.execute_sql("SELECT 1")  # lightweight probe; works for MySQL/Postgres
+    except Exception as exc:
+        return False, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", "error": str(exc)}
+    return True, {"elapsed": f"{(timer() - started) * 1000.0:.1f}"}
+
+
+def check_redis() -> tuple[bool, dict]:
+    """Return ``(ok, meta)``: the truthiness of ``REDIS_CONN.health()`` and the elapsed milliseconds."""
+    started = timer()
+    try:
+        return bool(REDIS_CONN.health()), {"elapsed": f"{(timer() - started) * 1000.0:.1f}"}
+    except Exception as exc:
+        return False, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", "error": str(exc)}
+
+
+def check_doc_engine() -> tuple[bool, dict]:
+    """Call the document store's ``health()``; any call that does not raise counts as ok."""
+    started = timer()
+    try:
+        engine_meta = settings.docStoreConn.health() or {}
+        return True, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", **engine_meta}
+    except Exception as exc:
+        return False, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", "error": str(exc)}
+
+
+def check_storage() -> tuple[bool, dict]:
+    started = timer()  # ok means STORAGE_IMPL.health() did not raise; its result is ignored
+    try:
+        STORAGE_IMPL.health()
+    except Exception as exc:
+        return False, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", "error": str(exc)}
+    return True, {"elapsed": f"{(timer() - started) * 1000.0:.1f}"}
+
+
+def check_chat() -> tuple[bool, dict]:
+    """Report ok when ``settings.CHAT_CFG`` exists and has a truthy ``"factory"`` entry."""
+    started = timer()
+    try:
+        chat_cfg = getattr(settings, "CHAT_CFG", None)
+        return bool(chat_cfg and chat_cfg.get("factory")), {"elapsed": f"{(timer() - started) * 1000.0:.1f}"}
+    except Exception as exc:
+        return False, {"elapsed": f"{(timer() - started) * 1000.0:.1f}", "error": str(exc)}
+
+
+def run_health_checks() -> tuple[dict, bool]:
+    """Run every dependency probe and summarise the outcome.
+
+    The "db" and "chat" probes decide the overall status; "redis", "doc_engine" and
+    "storage" are reported for observability only.
+
+    Returns:
+        A ``(result, all_ok)`` tuple. ``result`` maps each probe name to "ok" or "nok",
+        stores the overall flag under "status", and collects the metadata of failed
+        probes under "_meta". ``all_ok`` is True when both "db" and "chat" are "ok".
+    """
+    result: dict[str, str | dict] = {}
+
+    def record(name: str, ok: bool, meta: dict) -> None:
+        # Failure details are kept under "_meta", which is created on first use.
+        result[name] = _ok_nok(ok)
+        if not ok:
+            result.setdefault("_meta", {})[name] = meta
+
+    # Required probes: these two alone decide the overall status.
+    db_ok, db_meta = check_db()
+    chat_ok, chat_meta = check_chat()
+    record("db", db_ok, db_meta)
+    record("chat", chat_ok, chat_meta)
+
+    # Optional probes (do not change minimal contract but exposed for observability)
+    optional_probes = (
+        ("redis", check_redis),
+        ("doc_engine", check_doc_engine),
+        ("storage", check_storage),
+    )
+    for name, probe in optional_probes:
+        try:
+            probe_ok, probe_meta = probe()
+            record(name, probe_ok, probe_meta)
+        except Exception:
+            # A probe that raises is reported as failed, without metadata.
+            result[name] = "nok"
+
+    all_ok = result.get("db") == "ok" and result.get("chat") == "ok"
+    result["status"] = "ok" if all_ok else "nok"
+    return result, all_ok
+
+