Refactor: move some functions out of api/utils/__init__.py (#10216)

### What problem does this PR solve?

Refactor import modules.

### Type of change

- [x] Refactoring

---------

Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Fixed the issue where database connections were interrupted under high concurrency (#10126)
2026-01-04 03:25:30 +08:00 · 2025-09-25 18:04:49 +08:00 · 2025-09-25 17:03:43 +08:00 · 2025-09-25 16:47:56 +08:00 · 2025-09-25 16:15:15 +08:00 · 2025-09-25 14:11:09 +08:00
545 changed files with 30754 additions and 12507 deletions
--- a/admin/admin_client.py
+++ b/admin/admin_client.py
@ -1,13 +1,11 @@
 import argparse
 import base64
-
 from Cryptodome.PublicKey import RSA
 from Cryptodome.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5
 from typing import Dict, List, Any
 from lark import Lark, Transformer, Tree
 import requests
 from requests.auth import HTTPBasicAuth
-from api.common.base64 import encode_to_base64

 GRAMMAR = r"""
 start: command
@ -168,6 +166,11 @@ class AdminTransformer(Transformer):
        return items


+def encode_to_base64(input_string):
+    base64_encoded = base64.b64encode(input_string.encode('utf-8'))
+    return base64_encoded.decode('utf-8')
+
+
 def encrypt(input_string):
    pub = '-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEArq9XTUSeYr2+N1h3Afl/z8Dse/2yD0ZGrKwx+EEEcdsBLca9Ynmx3nIB5obmLlSfmskLpBo0UACBmB5rEjBp2Q2f3AG3Hjd4B+gNCG6BDaawuDlgANIhGnaTLrIqWrrcm4EMzJOnAOI1fgzJRsOOUEfaS318Eq9OVO3apEyCCt0lOQK6PuksduOjVxtltDav+guVAA068NrPYmRNabVKRNLJpL8w4D44sfth5RvZ3q9t+6RTArpEtc5sh5ChzvqPOzKGMXW83C95TxmXqpbK6olN4RevSfVjEAgCydH6HN6OhtOQEcnrU97r9H0iZOWwbw3pVrZiUkuRD1R56Wzs2wIDAQAB\n-----END PUBLIC KEY-----'
    pub_key = RSA.importKey(pub)
@ -429,13 +432,6 @@ class AdminCLI:
        username_tree: Tree = command['username']
        username: str = username_tree.children[0].strip("'\"")
        print(f"Drop user: {username}")
-        url = f'http://{self.host}:{self.port}/api/v1/admin/users/{username}'
-        response = requests.delete(url, auth=HTTPBasicAuth(self.admin_account, self.admin_password))
-        res_json = response.json()
-        if response.status_code == 200:
-            print(res_json["message"])
-        else:
-            print(f"Fail to drop user, code: {res_json['code']}, message: {res_json['message']}")

    def _handle_alter_user(self, command):
        username_tree: Tree = command['username']
@ -444,8 +440,7 @@ class AdminCLI:
        password: str = password_tree.children[0].strip("'\"")
        print(f"Alter user: {username}, password: {password}")
        url = f'http://{self.host}:{self.port}/api/v1/admin/users/{username}/password'
-        response = requests.put(url, auth=HTTPBasicAuth(self.admin_account, self.admin_password),
-                                json={'new_password': encrypt(password)})
+        response = requests.put(url, auth=HTTPBasicAuth(self.admin_account, self.admin_password), json={'new_password': encrypt(password)})
        res_json = response.json()
        if response.status_code == 200:
            print(res_json["message"])
@ -479,8 +474,7 @@ class AdminCLI:
        if activate_status.lower() in ['on', 'off']:
            print(f"Alter user {username} activate status, turn {activate_status.lower()}.")
            url = f'http://{self.host}:{self.port}/api/v1/admin/users/{username}/activate'
-            response = requests.put(url, auth=HTTPBasicAuth(self.admin_account, self.admin_password),
-                                    json={'activate_status': activate_status})
+            response = requests.put(url, auth=HTTPBasicAuth(self.admin_account, self.admin_password), json={'activate_status': activate_status})
            res_json = response.json()
            if response.status_code == 200:
                print(res_json["message"])
@ -538,7 +532,6 @@ Commands:
  DROP USER <user>
  CREATE USER <user> <password>
  ALTER USER PASSWORD <user> <new_password>
-  ALTER USER ACTIVE <user> <on/off>
  LIST DATASETS OF <user>
  LIST AGENTS OF <user>

--- a/admin/config.py
+++ b/admin/config.py
@ -4,7 +4,7 @@ from enum import Enum

 from pydantic import BaseModel
 from typing import Any
-from api.utils.configs import read_config
+from api.utils import read_config
 from urllib.parse import urlparse


--- a/admin/routes.py
+++ b/admin/routes.py
@ -57,11 +57,8 @@ def create_user():
@login_verify
 def delete_user(username):
    try:
-        res = UserMgr.delete_user(username)
-        if res["success"]:
-            return success_response(None, res["message"])
-        else:
-            return error_response(res["message"])
+        UserMgr.delete_user(username)
+        return success_response(None, "User and all data deleted successfully")

    except AdminException as e:
        return error_response(e.message, e.code)
--- a/admin/services.py
+++ b/admin/services.py
@ -2,7 +2,7 @@ import re
 from werkzeug.security import check_password_hash
 from api.db import ActiveEnum
 from api.db.services import UserService
-from api.db.joint_services.user_account_service import create_new_user, delete_user_data
+from api.db.joint_services.user_account_service import create_new_user
 from api.db.services.canvas_service import UserCanvasService
 from api.db.services.user_service import TenantService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@ -61,13 +61,7 @@ class UserMgr:
    @staticmethod
    def delete_user(username):
        # use email to delete
-        user_list = UserService.query_user_by_email(username)
-        if not user_list:
-            raise UserNotFoundError(username)
-        if len(user_list) > 1:
-            raise AdminException(f"Exist more than 1 user: {username}!")
-        usr = user_list[0]
-        return delete_user_data(usr.id)
+        raise AdminException("delete_user: not implemented")

    @staticmethod
    def update_user_password(username, new_password) -> str:
@ -140,13 +134,7 @@ class UserServiceMgr:
        tenants = TenantService.get_joined_tenants_by_user_id(usr.id)
        tenant_ids = [m["tenant_id"] for m in tenants]
        # filter permitted agents and owned agents
-        res = UserCanvasService.get_all_agents_by_tenant_ids(tenant_ids, usr.id)
-        return [{
-            'title': r['title'],
-            'permission': r['permission'],
-            'canvas_type': r['canvas_type'],
-            'canvas_category': r['canvas_category']
-        } for r in res]
+        return UserCanvasService.get_all_agents_by_tenant_ids(tenant_ids, usr.id)

 class ServiceMgr:

--- a/agent/canvas.py
+++ b/agent/canvas.py
@ -153,16 +153,6 @@ class Graph:
    def get_tenant_id(self):
        return self._tenant_id

-    def get_variable_value(self, exp: str) -> Any:
-        exp = exp.strip("{").strip("}").strip(" ").strip("{").strip("}")
-        if exp.find("@") < 0:
-            return self.globals[exp]
-        cpn_id, var_nm = exp.split("@")
-        cpn = self.get_component(cpn_id)
-        if not cpn:
-            raise Exception(f"Can't find variable: '{cpn_id}@{var_nm}'")
-        return cpn["obj"].output(var_nm)
-

 class Canvas(Graph):

@ -416,6 +406,16 @@ class Canvas(Graph):
            return False
        return True

+    def get_variable_value(self, exp: str) -> Any:
+        exp = exp.strip("{").strip("}").strip(" ").strip("{").strip("}")
+        if exp.find("@") < 0:
+            return self.globals[exp]
+        cpn_id, var_nm = exp.split("@")
+        cpn = self.get_component(cpn_id)
+        if not cpn:
+            raise Exception(f"Can't find variable: '{cpn_id}@{var_nm}'")
+        return cpn["obj"].output(var_nm)
+
    def get_history(self, window_size):
        convs = []
        if window_size <= 0:
--- a/agent/component/agent_with_tools.py
+++ b/agent/component/agent_with_tools.py
@ -137,7 +137,7 @@ class Agent(LLM, ToolBase):
            res.update(cpn.get_input_form())
        return res

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 20*60)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 20*60))
    def _invoke(self, **kwargs):
        if kwargs.get("user_prompt"):
            usr_pmt = ""
--- a/agent/component/base.py
+++ b/agent/component/base.py
@ -431,7 +431,7 @@ class ComponentBase(ABC):
        self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
        return self.output()

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60))
    def _invoke(self, **kwargs):
        raise NotImplementedError()

--- a/agent/component/invoke.py
+++ b/agent/component/invoke.py
@ -53,7 +53,7 @@ class InvokeParam(ComponentParamBase):
 class Invoke(ComponentBase, ABC):
    component_name = "Invoke"

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 3)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 3))
    def _invoke(self, **kwargs):
        args = {}
        for para in self._param.variables:
--- a/agent/component/llm.py
+++ b/agent/component/llm.py
@ -101,8 +101,6 @@ class LLM(ComponentBase):

    def get_input_elements(self) -> dict[str, Any]:
        res = self.get_input_elements_from_text(self._param.sys_prompt)
-        if isinstance(self._param.prompts, str):
-            self._param.prompts = [{"role": "user", "content": self._param.prompts}]
        for prompt in self._param.prompts:
            d = self.get_input_elements_from_text(prompt["content"])
            res.update(d)
@ -114,17 +112,6 @@ class LLM(ComponentBase):
    def add2system_prompt(self, txt):
        self._param.sys_prompt += txt

-    def _sys_prompt_and_msg(self, msg, args):
-        if isinstance(self._param.prompts, str):
-            self._param.prompts = [{"role": "user", "content": self._param.prompts}]
-        for p in self._param.prompts:
-            if msg and msg[-1]["role"] == p["role"]:
-                continue
-            p = deepcopy(p)
-            p["content"] = self.string_format(p["content"], args)
-            msg.append(p)
-        return msg, self.string_format(self._param.sys_prompt, args)
-
    def _prepare_prompt_variables(self):
        if self._param.visual_files_var:
            self.imgs = self._canvas.get_variable_value(self._param.visual_files_var)
@ -140,6 +127,7 @@ class LLM(ComponentBase):

        args = {}
        vars = self.get_input_elements() if not self._param.debug_inputs else self._param.debug_inputs
+        sys_prompt = self._param.sys_prompt
        for k, o in vars.items():
            args[k] = o["value"]
            if not isinstance(args[k], str):
@ -149,8 +137,16 @@ class LLM(ComponentBase):
                    args[k] = str(args[k])
            self.set_input_value(k, args[k])

-        msg, sys_prompt = self._sys_prompt_and_msg(self._canvas.get_history(self._param.message_history_window_size)[:-1], args)
+        msg = self._canvas.get_history(self._param.message_history_window_size)[:-1]
+        for p in self._param.prompts:
+            if msg and msg[-1]["role"] == p["role"]:
+                continue
+            msg.append(deepcopy(p))
+
+        sys_prompt = self.string_format(sys_prompt, args)
        user_defined_prompt, sys_prompt = self._extract_prompts(sys_prompt)
+        for m in msg:
+            m["content"] = self.string_format(m["content"], args)
        if self._param.cite and self._canvas.get_reference()["chunks"]:
            sys_prompt += citation_prompt(user_defined_prompt)

@ -205,7 +201,7 @@ class LLM(ComponentBase):
            for txt in self.chat_mdl.chat_streamly(msg[0]["content"], msg[1:], self._param.gen_conf(), images=self.imgs, **kwargs):
                yield delta(txt)

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60))
    def _invoke(self, **kwargs):
        def clean_formated_answer(ans: str) -> str:
            ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
--- a/agent/component/message.py
+++ b/agent/component/message.py
@ -127,7 +127,7 @@ class Message(ComponentBase):
        ]
        return any([re.search(p, content) for p in patt])

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60))
    def _invoke(self, **kwargs):
        rand_cnt = random.choice(self._param.content)
        if self._param.stream and not self._is_jinjia2(rand_cnt):
--- a/agent/component/switch.py
+++ b/agent/component/switch.py
@ -61,7 +61,7 @@ class SwitchParam(ComponentParamBase):
 class Switch(ComponentBase, ABC):
    component_name = "Switch"

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 3)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 3))
    def _invoke(self, **kwargs):
        for cond in self._param.conditions:
            res = []
--- a/agent/tools/code_exec.py
+++ b/agent/tools/code_exec.py
@ -157,7 +157,7 @@ class CodeExec(ToolBase, ABC):

        try:
            resp = requests.post(url=f"http://{settings.SANDBOX_HOST}:9385/run", json=code_req, timeout=os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60))
-            logging.info(f"http://{settings.SANDBOX_HOST}:9385/run", code_req, resp.status_code)
+            logging.info(f"http://{settings.SANDBOX_HOST}:9385/run,  code_req: {code_req}, resp.status_code {resp.status_code}:")
            if resp.status_code != 200:
                resp.raise_for_status()
            body = resp.json()
--- a/agent/tools/exesql.py
+++ b/agent/tools/exesql.py
@ -53,7 +53,7 @@ class ExeSQLParam(ToolParamBase):
        self.max_records = 1024

    def check(self):
-        self.check_valid_value(self.db_type, "Choose DB type", ['mysql', 'postgres', 'mariadb', 'mssql', 'IBM DB2'])
+        self.check_valid_value(self.db_type, "Choose DB type", ['mysql', 'postgres', 'mariadb', 'mssql'])
        self.check_empty(self.database, "Database name")
        self.check_empty(self.username, "database username")
        self.check_empty(self.host, "IP Address")
@ -123,55 +123,6 @@ class ExeSQL(ToolBase, ABC):
                    r'PWD=' + self._param.password
            )
            db = pyodbc.connect(conn_str)
-        elif self._param.db_type == 'IBM DB2':
-            import ibm_db
-            conn_str = (
-                f"DATABASE={self._param.database};"
-                f"HOSTNAME={self._param.host};"
-                f"PORT={self._param.port};"
-                f"PROTOCOL=TCPIP;"
-                f"UID={self._param.username};"
-                f"PWD={self._param.password};"
-            )
-            try:
-                conn = ibm_db.connect(conn_str, "", "")
-            except Exception as e:
-                raise Exception("Database Connection Failed! \n" + str(e))
-
-            sql_res = []
-            formalized_content = []
-            for single_sql in sqls:
-                single_sql = single_sql.replace("```", "").strip()
-                if not single_sql:
-                    continue
-                single_sql = re.sub(r"\[ID:[0-9]+\]", "", single_sql)
-
-                stmt = ibm_db.exec_immediate(conn, single_sql)
-                rows = []
-                row = ibm_db.fetch_assoc(stmt)
-                while row and len(rows) < self._param.max_records:
-                    rows.append(row)
-                    row = ibm_db.fetch_assoc(stmt)
-
-                if not rows:
-                    sql_res.append({"content": "No record in the database!"})
-                    continue
-
-                df = pd.DataFrame(rows)
-                for col in df.columns:
-                    if pd.api.types.is_datetime64_any_dtype(df[col]):
-                        df[col] = df[col].dt.strftime("%Y-%m-%d")
-
-                df = df.where(pd.notnull(df), None)
-
-                sql_res.append(convert_decimals(df.to_dict(orient="records")))
-                formalized_content.append(df.to_markdown(index=False, floatfmt=".6f"))
-
-            ibm_db.close(conn)
-
-            self.set_output("json", sql_res)
-            self.set_output("formalized_content", "\n\n".join(formalized_content))
-            return self.output("formalized_content")
        try:
            cursor = db.cursor()
        except Exception as e:
@ -199,8 +150,6 @@ class ExeSQL(ToolBase, ABC):
                if pd.api.types.is_datetime64_any_dtype(single_res[col]):
                    single_res[col] = single_res[col].dt.strftime('%Y-%m-%d')

-            single_res = single_res.where(pd.notnull(single_res), None)
-
            sql_res.append(convert_decimals(single_res.to_dict(orient='records')))
            formalized_content.append(single_res.to_markdown(index=False, floatfmt=".6f"))

--- a/api/apps/canvas_app.py
+++ b/api/apps/canvas_app.py
@ -19,19 +19,15 @@ import re
 import sys
 from functools import partial

-import flask
 import trio
 from flask import request, Response
 from flask_login import login_required, current_user

-from agent.component import LLM
-from api import settings
+from agent.component.llm import LLM
 from api.db import CanvasCategory, FileType
 from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService, API4ConversationService
 from api.db.services.document_service import DocumentService
 from api.db.services.file_service import FileService
-from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
-from api.db.services.task_service import queue_dataflow, CANVAS_DEBUG_DOC_ID, TaskService
 from api.db.services.user_service import TenantService
 from api.db.services.user_canvas_version import UserCanvasVersionService
 from api.settings import RetCode
@ -39,12 +35,10 @@ from api.utils import get_uuid
 from api.utils.api_utils import get_json_result, server_error_response, validate_request, get_data_error_result
 from agent.canvas import Canvas
 from peewee import MySQLDatabase, PostgresqlDatabase
-from api.db.db_models import APIToken, Task
+from api.db.db_models import APIToken
 import time

 from api.utils.file_utils import filename_type, read_potential_broken_pdf
-from rag.flow.pipeline import Pipeline
-from rag.nlp import search
 from rag.utils.redis_conn import REDIS_CONN


@ -54,6 +48,14 @@ def templates():
    return get_json_result(data=[c.to_dict() for c in CanvasTemplateService.query(canvas_category=CanvasCategory.Agent)])


+@manager.route('/list', methods=['GET'])  # noqa: F821
+@login_required
+def canvas_list():
+    return get_json_result(data=sorted([c.to_dict() for c in \
+                                 UserCanvasService.query(user_id=current_user.id, canvas_category=CanvasCategory.Agent)], key=lambda x: x["update_time"]*-1)
+                           )
+
+
@manager.route('/rm', methods=['POST'])  # noqa: F821
@validate_request("canvas_ids")
@login_required
@ -75,10 +77,9 @@ def save():
    if not isinstance(req["dsl"], str):
        req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)
    req["dsl"] = json.loads(req["dsl"])
-    cate = req.get("canvas_category", CanvasCategory.Agent)
    if "id" not in req:
        req["user_id"] = current_user.id
-        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=cate):
+        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=CanvasCategory.Agent):
            return get_data_error_result(message=f"{req['title'].strip()} already exists.")
        req["id"] = get_uuid()
        if not UserCanvasService.save(**req):
@ -100,7 +101,7 @@ def save():
 def get(canvas_id):
    if not UserCanvasService.accessible(canvas_id, current_user.id):
        return get_data_error_result(message="canvas not found.")
-    e, c = UserCanvasService.get_by_canvas_id(canvas_id)
+    e, c = UserCanvasService.get_by_tenant_id(canvas_id)
    return get_json_result(data=c)


@ -147,14 +148,6 @@ def run():
    if not isinstance(cvs.dsl, str):
        cvs.dsl = json.dumps(cvs.dsl, ensure_ascii=False)

-    if cvs.canvas_category == CanvasCategory.DataFlow:
-        task_id = get_uuid()
-        Pipeline(cvs.dsl, tenant_id=current_user.id, doc_id=CANVAS_DEBUG_DOC_ID, task_id=task_id, flow_id=req["id"])
-        ok, error_message = queue_dataflow(tenant_id=user_id, flow_id=req["id"], task_id=task_id, file=files[0], priority=0)
-        if not ok:
-            return get_data_error_result(message=error_message)
-        return get_json_result(data={"message_id": task_id})
-
    try:
        canvas = Canvas(cvs.dsl, current_user.id, req["id"])
    except Exception as e:
@ -180,44 +173,6 @@ def run():
    return resp


-@manager.route('/rerun', methods=['POST'])  # noqa: F821
-@validate_request("id", "dsl", "component_id")
-@login_required
-def rerun():
-    req = request.json
-    doc = PipelineOperationLogService.get_documents_info(req["id"])
-    if not doc:
-        return get_data_error_result(message="Document not found.")
-    doc = doc[0]
-    if 0 < doc["progress"] < 1:
-        return get_data_error_result(message=f"`{doc['name']}` is processing...")
-
-    if settings.docStoreConn.indexExist(search.index_name(current_user.id), doc["kb_id"]):
-        settings.docStoreConn.delete({"doc_id": doc["id"]}, search.index_name(current_user.id), doc["kb_id"])
-    doc["progress_msg"] = ""
-    doc["chunk_num"] = 0
-    doc["token_num"] = 0
-    DocumentService.clear_chunk_num_when_rerun(doc["id"])
-    DocumentService.update_by_id(id, doc)
-    TaskService.filter_delete([Task.doc_id == id])
-
-    dsl = req["dsl"]
-    dsl["path"] = [req["component_id"]]
-    PipelineOperationLogService.update_by_id(req["id"], {"dsl": dsl})
-    queue_dataflow(tenant_id=current_user.id, flow_id=req["id"], task_id=get_uuid(), doc_id=doc["id"], priority=0, rerun=True)
-    return get_json_result(data=True)
-
-
-@manager.route('/cancel/<task_id>', methods=['PUT'])  # noqa: F821
-@login_required
-def cancel(task_id):
-    try:
-        REDIS_CONN.set(f"{task_id}-cancel", "x")
-    except Exception as e:
-        logging.exception(e)
-    return get_json_result(data=True)
-
-
@manager.route('/reset', methods=['POST'])  # noqa: F821
@validate_request("id")
@login_required
@ -243,7 +198,7 @@ def reset():

@manager.route("/upload/<canvas_id>", methods=["POST"])  # noqa: F821
 def upload(canvas_id):
-    e, cvs = UserCanvasService.get_by_canvas_id(canvas_id)
+    e, cvs = UserCanvasService.get_by_tenant_id(canvas_id)
    if not e:
        return get_data_error_result(message="canvas not found.")

@ -393,22 +348,6 @@ def test_db_connect():
            cursor = db.cursor()
            cursor.execute("SELECT 1")
            cursor.close()
-        elif req["db_type"] == 'IBM DB2':
-            import ibm_db
-            conn_str = (
-                f"DATABASE={req['database']};"
-                f"HOSTNAME={req['host']};"
-                f"PORT={req['port']};"
-                f"PROTOCOL=TCPIP;"
-                f"UID={req['username']};"
-                f"PWD={req['password']};"
-            )
-            logging.info(conn_str)
-            conn = ibm_db.connect(conn_str, "", "")
-            stmt = ibm_db.exec_immediate(conn, "SELECT 1 FROM sysibm.sysdummy1")
-            ibm_db.fetch_assoc(stmt)
-            ibm_db.close(conn)
-            return get_json_result(data="Database Connection Successful!")
        else:
            return server_error_response("Unsupported database type.")
        if req["db_type"] != 'mssql':
@ -444,32 +383,22 @@ def getversion( version_id):
        return get_json_result(data=f"Error getting history file: {e}")


-@manager.route('/list', methods=['GET'])  # noqa: F821
+@manager.route('/listteam', methods=['GET'])  # noqa: F821
@login_required
 def list_canvas():
    keywords = request.args.get("keywords", "")
-    page_number = int(request.args.get("page", 0))
-    items_per_page = int(request.args.get("page_size", 0))
+    page_number = int(request.args.get("page", 1))
+    items_per_page = int(request.args.get("page_size", 150))
    orderby = request.args.get("orderby", "create_time")
-    canvas_category = request.args.get("canvas_category")
-    if request.args.get("desc", "true").lower() == "false":
-        desc = False
-    else:
-        desc = True
-    owner_ids = [id for id in request.args.get("owner_ids", "").strip().split(",") if id]
-    if not owner_ids:
+    desc = request.args.get("desc", True)
+    try:
        tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
-        tenants = [m["tenant_id"] for m in tenants]
-        tenants.append(current_user.id)
        canvas, total = UserCanvasService.get_by_tenant_ids(
-            tenants, current_user.id, page_number,
-            items_per_page, orderby, desc, keywords, canvas_category)
-    else:
-        tenants = owner_ids
-        canvas, total = UserCanvasService.get_by_tenant_ids(
-            tenants, current_user.id, 0,
-            0, orderby, desc, keywords, canvas_category)
-    return get_json_result(data={"canvas": canvas, "total": total})
+            [m["tenant_id"] for m in tenants], current_user.id, page_number,
+            items_per_page, orderby, desc, keywords, canvas_category=CanvasCategory.Agent)
+        return get_json_result(data={"canvas": canvas, "total": total})
+    except Exception as e:
+        return server_error_response(e)


@manager.route('/setting', methods=['POST'])  # noqa: F821
@ -554,11 +483,3 @@ def prompts():
        #"context_ranking": RANK_MEMORY,
        "citation_guidelines": CITATION_PROMPT_TEMPLATE
    })
-
-
-@manager.route('/download', methods=['GET'])  # noqa: F821
-def download():
-    id = request.args.get("id")
-    created_by = request.args.get("created_by")
-    blob = FileService.get_blob(created_by, id)
-    return flask.make_response(blob)
--- a/api/apps/dataflow_app.py
+++ b/api/apps/dataflow_app.py
@ -0,0 +1,353 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import json
+import re
+import sys
+import time
+from functools import partial
+
+import trio
+from flask import request
+from flask_login import current_user, login_required
+
+from agent.canvas import Canvas
+from agent.component.llm import LLM
+from api.db import CanvasCategory, FileType
+from api.db.services.canvas_service import CanvasTemplateService, UserCanvasService
+from api.db.services.document_service import DocumentService
+from api.db.services.file_service import FileService
+from api.db.services.task_service import queue_dataflow
+from api.db.services.user_canvas_version import UserCanvasVersionService
+from api.db.services.user_service import TenantService
+from api.settings import RetCode
+from api.utils import get_uuid
+from api.utils.api_utils import get_data_error_result, get_json_result, server_error_response, validate_request
+from api.utils.file_utils import filename_type, read_potential_broken_pdf
+from rag.flow.pipeline import Pipeline
+
+
+@manager.route("/templates", methods=["GET"])  # noqa: F821
+@login_required
+def templates():
+    return get_json_result(data=[c.to_dict() for c in CanvasTemplateService.query(canvas_category=CanvasCategory.DataFlow)])
+
+
+@manager.route("/list", methods=["GET"])  # noqa: F821
+@login_required
+def canvas_list():
+    return get_json_result(data=sorted([c.to_dict() for c in UserCanvasService.query(user_id=current_user.id, canvas_category=CanvasCategory.DataFlow)], key=lambda x: x["update_time"] * -1))
+
+
+@manager.route("/rm", methods=["POST"])  # noqa: F821
+@validate_request("canvas_ids")
+@login_required
+def rm():
+    for i in request.json["canvas_ids"]:
+        if not UserCanvasService.accessible(i, current_user.id):
+            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+        UserCanvasService.delete_by_id(i)
+    return get_json_result(data=True)
+
+
+@manager.route("/set", methods=["POST"])  # noqa: F821
+@validate_request("dsl", "title")
+@login_required
+def save():
+    req = request.json
+    if not isinstance(req["dsl"], str):
+        req["dsl"] = json.dumps(req["dsl"], ensure_ascii=False)
+    req["dsl"] = json.loads(req["dsl"])
+    req["canvas_category"] = CanvasCategory.DataFlow
+    if "id" not in req:
+        req["user_id"] = current_user.id
+        if UserCanvasService.query(user_id=current_user.id, title=req["title"].strip(), canvas_category=CanvasCategory.DataFlow):
+            return get_data_error_result(message=f"{req['title'].strip()} already exists.")
+        req["id"] = get_uuid()
+
+        if not UserCanvasService.save(**req):
+            return get_data_error_result(message="Fail to save canvas.")
+    else:
+        if not UserCanvasService.accessible(req["id"], current_user.id):
+            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+        UserCanvasService.update_by_id(req["id"], req)
+    # save version
+    UserCanvasVersionService.insert(user_canvas_id=req["id"], dsl=req["dsl"], title="{0}_{1}".format(req["title"], time.strftime("%Y_%m_%d_%H_%M_%S")))
+    UserCanvasVersionService.delete_all_versions(req["id"])
+    return get_json_result(data=req)
+
+
+@manager.route("/get/<canvas_id>", methods=["GET"])  # noqa: F821
+@login_required
+def get(canvas_id):
+    if not UserCanvasService.accessible(canvas_id, current_user.id):
+        return get_data_error_result(message="canvas not found.")
+    e, c = UserCanvasService.get_by_tenant_id(canvas_id)
+    return get_json_result(data=c)
+
+
+@manager.route("/run", methods=["POST"])  # noqa: F821
+@validate_request("id")
+@login_required
+def run():
+    req = request.json
+    flow_id = req.get("id", "")
+    doc_id = req.get("doc_id", "")
+    if not all([flow_id, doc_id]):
+        return get_data_error_result(message="id and doc_id are required.")
+
+    if not DocumentService.get_by_id(doc_id):
+        return get_data_error_result(message=f"Document for {doc_id} not found.")
+
+    user_id = req.get("user_id", current_user.id)
+    if not UserCanvasService.accessible(flow_id, current_user.id):
+        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+
+    e, cvs = UserCanvasService.get_by_id(flow_id)
+    if not e:
+        return get_data_error_result(message="canvas not found.")
+
+    if not isinstance(cvs.dsl, str):
+        cvs.dsl = json.dumps(cvs.dsl, ensure_ascii=False)
+
+    task_id = get_uuid()
+
+    ok, error_message = queue_dataflow(dsl=cvs.dsl, tenant_id=user_id, doc_id=doc_id, task_id=task_id, flow_id=flow_id, priority=0)
+    if not ok:
+        return server_error_response(error_message)
+
+    return get_json_result(data={"task_id": task_id, "flow_id": flow_id})
+
+
+@manager.route("/reset", methods=["POST"])  # noqa: F821
+@validate_request("id")
+@login_required
+def reset():
+    """Reset the pipeline of a canvas and persist the resulting DSL.
+
+    Reads ``id`` and an optional ``task_id`` from the JSON body and returns
+    the DSL obtained after the reset.
+    """
+    payload = request.json
+    flow_id = payload.get("id", "")
+    if not flow_id:
+        return get_data_error_result(message="id is required.")
+
+    allowed = UserCanvasService.accessible(flow_id, current_user.id)
+    if not allowed:
+        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+
+    task_id = payload.get("task_id", "")
+
+    try:
+        found, canvas_row = UserCanvasService.get_by_id(flow_id)
+        if not found:
+            return get_data_error_result(message="canvas not found.")
+
+        pipeline = Pipeline(
+            dsl=json.dumps(canvas_row.dsl),
+            tenant_id=current_user.id,
+            flow_id=flow_id,
+            task_id=task_id,
+        )
+        pipeline.reset()
+
+        # str(pipeline) is parsed back as JSON to obtain the DSL to store.
+        reset_dsl = json.loads(str(pipeline))
+        payload["dsl"] = reset_dsl
+        UserCanvasService.update_by_id(flow_id, {"dsl": reset_dsl})
+        return get_json_result(data=reset_dsl)
+    except Exception as exc:
+        return server_error_response(exc)
+
+
+@manager.route("/upload/<canvas_id>", methods=["POST"])  # noqa: F821
+def upload(canvas_id):
+    """Store a file for a canvas and return its descriptor.
+
+    The content comes either from the ``url`` query parameter (the page is
+    crawled and saved as PDF, or as markdown when no PDF is produced) or from
+    the multipart field ``file``. Blobs are stored under the ``user_id`` of
+    the canvas record.
+    """
+    e, cvs = UserCanvasService.get_by_tenant_id(canvas_id)
+    if not e:
+        return get_data_error_result(message="canvas not found.")
+
+    user_id = cvs["user_id"]
+
+    def structured(filename, filetype, blob, content_type):
+        # Persist the blob under a fresh id and build the descriptor dict.
+        if filetype == FileType.PDF.value:
+            blob = read_potential_broken_pdf(blob)
+
+        location = get_uuid()
+        FileService.put_blob(user_id, location, blob)
+
+        return {
+            "id": location,
+            "name": filename,
+            # Payload length; sys.getsizeof would also count the Python
+            # object header and overstate the size.
+            "size": len(blob),
+            "extension": filename.split(".")[-1].lower(),
+            "mime_type": content_type,
+            "created_by": user_id,
+            "created_at": time.time(),
+            "preview_url": None,
+        }
+
+    if request.args.get("url"):
+        from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult, DefaultMarkdownGenerator, PruningContentFilter
+
+        try:
+            url = request.args.get("url")
+            # Last path segment with any query string stripped.
+            filename = re.sub(r"\?.*", "", url.split("/")[-1])
+
+            async def adownload():
+                browser_config = BrowserConfig(
+                    headless=True,
+                    verbose=False,
+                )
+                async with AsyncWebCrawler(config=browser_config) as crawler:
+                    crawler_config = CrawlerRunConfig(markdown_generator=DefaultMarkdownGenerator(content_filter=PruningContentFilter()), pdf=True, screenshot=False)
+                    result: CrawlResult = await crawler.arun(url=url, config=crawler_config)
+                    return result
+
+            # trio.run expects the async function itself, not a coroutine object.
+            # NOTE(review): confirm the crawler can run under trio's event loop.
+            page = trio.run(adownload)
+            if page.pdf:
+                if filename.split(".")[-1].lower() != "pdf":
+                    filename += ".pdf"
+                return get_json_result(data=structured(filename, "pdf", page.pdf, page.response_headers["content-type"]))
+
+            # structured() takes four arguments; user_id reaches it via the closure.
+            return get_json_result(data=structured(filename, "html", str(page.markdown).encode("utf-8"), page.response_headers["content-type"]))
+
+        except Exception as e:
+            return server_error_response(e)
+
+    file = request.files["file"]
+    try:
+        DocumentService.check_doc_health(user_id, file.filename)
+        return get_json_result(data=structured(file.filename, filename_type(file.filename), file.read(), file.content_type))
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/input_form", methods=["GET"])  # noqa: F821
+@login_required
+def input_form():
+    """Return the input form of one component of a canvas owned by the caller.
+
+    Query parameters: ``id`` (canvas id) and ``component_id``.
+    """
+    canvas_id = request.args.get("id")
+    component_id = request.args.get("component_id")
+    try:
+        found, canvas_row = UserCanvasService.get_by_id(canvas_id)
+        if not found:
+            return get_data_error_result(message="canvas not found.")
+
+        # The canvas must be registered under the current user's id.
+        owned = UserCanvasService.query(user_id=current_user.id, id=canvas_id)
+        if not owned:
+            return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+
+        pipeline = Pipeline(
+            dsl=json.dumps(canvas_row.dsl),
+            tenant_id=current_user.id,
+            flow_id=canvas_id,
+            task_id="",
+        )
+        form = pipeline.get_component_input_form(component_id)
+        return get_json_result(data=form)
+    except Exception as exc:
+        return server_error_response(exc)
+
+
+@manager.route("/debug", methods=["POST"])  # noqa: F821
+@validate_request("id", "component_id", "params")
+@login_required
+def debug():
+    """Invoke a single canvas component with caller-supplied inputs.
+
+    Reads ``id``, ``component_id`` and ``params`` from the JSON body; each
+    entry of ``params`` is expected to carry a ``"value"`` key. Returns the
+    component outputs, with streamed (``partial``) outputs drained to text.
+    """
+    req = request.json
+    if not UserCanvasService.accessible(req["id"], current_user.id):
+        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+    try:
+        e, user_canvas = UserCanvasService.get_by_id(req["id"])
+        # Report a missing canvas explicitly instead of failing on user_canvas.dsl.
+        if not e:
+            return get_data_error_result(message="canvas not found.")
+
+        canvas = Canvas(json.dumps(user_canvas.dsl), current_user.id)
+        canvas.reset()
+        canvas.message_id = get_uuid()
+        component = canvas.get_component(req["component_id"])["obj"]
+        component.reset()
+
+        if isinstance(component, LLM):
+            component.set_debug_inputs(req["params"])
+        component.invoke(**{k: o["value"] for k, o in req["params"].items()})
+        outputs = component.output()
+        for k in outputs.keys():
+            # A partial is a deferred generator of text chunks; drain it.
+            if isinstance(outputs[k], partial):
+                outputs[k] = "".join(outputs[k]())
+        return get_json_result(data=outputs)
+    except Exception as e:
+        return server_error_response(e)
+
+
+# List the stored DSL versions of a canvas, newest first.
+@manager.route("/getlistversion/<canvas_id>", methods=["GET"])  # noqa: F821
+@login_required
+def getlistversion(canvas_id):
+    """Return every stored version of ``canvas_id`` ordered by descending ``update_time``."""
+    try:
+        versions = [row.to_dict() for row in UserCanvasVersionService.list_by_canvas_id(canvas_id)]
+        # Negating the key keeps the sort ascending while putting the latest first.
+        versions.sort(key=lambda item: item["update_time"] * -1)
+        return get_json_result(data=versions)
+    except Exception as exc:
+        return get_data_error_result(message=f"Error getting history files: {exc}")
+
+
+# Fetch one stored DSL version of a canvas.
+@manager.route("/getversion/<version_id>", methods=["GET"])  # noqa: F821
+@login_required
+def getversion(version_id):
+    """Return the stored canvas version identified by ``version_id``.
+
+    Responds with an error result when the version does not exist or the
+    lookup raises, so the view always returns a response.
+    """
+    try:
+        e, version = UserCanvasVersionService.get_by_id(version_id)
+        if not version:
+            # Previously this path fell through and the view returned None.
+            return get_data_error_result(message="Version not found.")
+        return get_json_result(data=version.to_dict())
+    except Exception as e:
+        # Report failures as errors, matching getlistversion.
+        return get_data_error_result(message=f"Error getting history file: {e}")
+
+
+@manager.route("/listteam", methods=["GET"])  # noqa: F821
+@login_required
+def list_canvas():
+    """List DataFlow canvases of the tenants the current user has joined.
+
+    Query parameters: ``keywords``, ``page`` (default 1), ``page_size``
+    (default 150), ``orderby`` (default ``create_time``) and ``desc``
+    (descending unless the value is ``false``, case-insensitive).
+    """
+    keywords = request.args.get("keywords", "")
+    page_number = int(request.args.get("page", 1))
+    items_per_page = int(request.args.get("page_size", 150))
+    orderby = request.args.get("orderby", "create_time")
+    # Query-string values are strings, so "false" would otherwise be truthy.
+    desc = request.args.get("desc", "true").lower() != "false"
+    try:
+        tenants = TenantService.get_joined_tenants_by_user_id(current_user.id)
+        canvas, total = UserCanvasService.get_by_tenant_ids(
+            [m["tenant_id"] for m in tenants], current_user.id, page_number, items_per_page, orderby, desc, keywords, canvas_category=CanvasCategory.DataFlow
+        )
+        return get_json_result(data={"canvas": canvas, "total": total})
+    except Exception as e:
+        return server_error_response(e)
+
+
+@manager.route("/setting", methods=["POST"])  # noqa: F821
+@validate_request("id", "title", "permission")
+@login_required
+def setting():
+    """Update the title and optional display settings of a canvas.
+
+    ``title`` is always overwritten; ``description``, ``permission`` and
+    ``avatar`` are overwritten only when the request carries a truthy value.
+    Returns whatever ``UserCanvasService.update_by_id`` reports.
+    """
+    payload = request.json
+    payload["user_id"] = current_user.id
+    canvas_id = payload["id"]
+
+    if not UserCanvasService.accessible(canvas_id, current_user.id):
+        return get_json_result(data=False, message="Only owner of canvas authorized for this operation.", code=RetCode.OPERATING_ERROR)
+
+    found, record = UserCanvasService.get_by_id(canvas_id)
+    if not found:
+        return get_data_error_result(message="canvas not found.")
+
+    fields = record.to_dict()
+    fields["title"] = payload["title"]
+    # Empty or missing optional fields leave the stored value untouched.
+    optional_keys = ("description", "permission", "avatar")
+    fields.update({name: payload[name] for name in optional_keys if payload.get(name)})
+
+    updated = UserCanvasService.update_by_id(canvas_id, fields)
+    return get_json_result(data=updated)
+
+
+@manager.route("/trace", methods=["GET"])  # noqa: F821
+def trace():
+    """Return the logs fetched for one dataflow task.
+
+    Query parameters ``dataflow_id`` and ``task_id`` are both required.
+    """
+    flow_id = request.args.get("dataflow_id")
+    task_id = request.args.get("task_id")
+    if not (flow_id and task_id):
+        return get_data_error_result(message="dataflow_id and task_id are required.")
+
+    found, canvas_row = UserCanvasService.get_by_id(flow_id)
+    if not found:
+        return get_data_error_result(message="dataflow not found.")
+
+    # The pipeline is rebuilt from the stored DSL under the canvas owner's id.
+    pipeline = Pipeline(
+        dsl=json.dumps(canvas_row.dsl, ensure_ascii=False),
+        tenant_id=canvas_row.user_id,
+        flow_id=flow_id,
+        task_id=task_id,
+    )
+    return get_json_result(data=pipeline.fetch_logs())
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@ -32,7 +32,7 @@ from api.db.services.document_service import DocumentService, doc_upload_and_par
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks, queue_dataflow
+from api.db.services.task_service import TaskService, cancel_all_task_of, queue_tasks
 from api.db.services.user_service import UserTenantService
 from api.utils import get_uuid
 from api.utils.api_utils import (
@ -182,7 +182,6 @@ def create():
                "id": get_uuid(),
                "kb_id": kb.id,
                "parser_id": kb.parser_id,
-                "pipeline_id": kb.pipeline_id,
                "parser_config": kb.parser_config,
                "created_by": current_user.id,
                "type": FileType.VIRTUAL,
@ -480,11 +479,8 @@ def run():
                        kb_table_num_map[kb_id] = count
                        if kb_table_num_map[kb_id] <= 0:
                            KnowledgebaseService.delete_field_map(kb_id)
-                if doc.get("pipeline_id", ""):
-                    queue_dataflow(tenant_id, flow_id=doc["pipeline_id"], task_id=get_uuid(), doc_id=id)
-                else:
-                    bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
-                    queue_tasks(doc, bucket, name, 0)
+                bucket, name = File2DocumentService.get_storage_address(doc_id=doc["id"])
+                queue_tasks(doc, bucket, name, 0)

        return get_json_result(data=True)
    except Exception as e:
@ -550,22 +546,31 @@ def get(doc_id):

@manager.route("/change_parser", methods=["POST"])  # noqa: F821
@login_required
-@validate_request("doc_id")
+@validate_request("doc_id", "parser_id")
 def change_parser():
    req = request.json

    if not DocumentService.accessible(req["doc_id"], current_user.id):
        return get_json_result(data=False, message="No authorization.", code=settings.RetCode.AUTHENTICATION_ERROR)
+    try:
+        e, doc = DocumentService.get_by_id(req["doc_id"])
+        if not e:
+            return get_data_error_result(message="Document not found!")
+        if doc.parser_id.lower() == req["parser_id"].lower():
+            if "parser_config" in req:
+                if req["parser_config"] == doc.parser_config:
+                    return get_json_result(data=True)
+            else:
+                return get_json_result(data=True)

-    e, doc = DocumentService.get_by_id(req["doc_id"])
-    if not e:
-        return get_data_error_result(message="Document not found!")
+        if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
+            return get_data_error_result(message="Not supported yet!")

-    def reset_doc():
-        nonlocal doc
        e = DocumentService.update_by_id(doc.id, {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
        if not e:
            return get_data_error_result(message="Document not found!")
+        if "parser_config" in req:
+            DocumentService.update_parser_config(doc.id, req["parser_config"])
        if doc.token_num > 0:
            e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1)
            if not e:
@ -576,26 +581,6 @@ def change_parser():
            if settings.docStoreConn.indexExist(search.index_name(tenant_id), doc.kb_id):
                settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)

-    try:
-        if "pipeline_id" in req:
-            if doc.pipeline_id == req["pipeline_id"]:
-                return get_json_result(data=True)
-            DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]})
-            reset_doc()
-            return get_json_result(data=True)
-
-        if doc.parser_id.lower() == req["parser_id"].lower():
-            if "parser_config" in req:
-                if req["parser_config"] == doc.parser_config:
-                    return get_json_result(data=True)
-            else:
-                return get_json_result(data=True)
-
-        if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
-            return get_data_error_result(message="Not supported yet!")
-        if "parser_config" in req:
-            DocumentService.update_parser_config(doc.id, req["parser_config"])
-        reset_doc()
        return get_json_result(data=True)
    except Exception as e:
        return server_error_response(e)
--- a/api/apps/file_app.py
+++ b/api/apps/file_app.py
@ -246,8 +246,6 @@ def rm():
                return get_data_error_result(message="File or Folder not found!")
            if not file.tenant_id:
                return get_data_error_result(message="Tenant not found!")
-            if file.tenant_id != current_user.id:
-                return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
            if file.source_type == FileSource.KNOWLEDGEBASE:
                continue

@ -294,8 +292,6 @@ def rename():
        e, file = FileService.get_by_id(req["file_id"])
        if not e:
            return get_data_error_result(message="File not found!")
-        if file.tenant_id != current_user.id:
-            return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
        if file.type != FileType.FOLDER.value \
            and pathlib.Path(req["name"].lower()).suffix != pathlib.Path(
                file.name.lower()).suffix:
@ -332,8 +328,6 @@ def get(file_id):
        e, file = FileService.get_by_id(file_id)
        if not e:
            return get_data_error_result(message="Document not found!")
-        if file.tenant_id != current_user.id:
-            return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)

        blob = STORAGE_IMPL.get(file.parent_id, file.location)
        if not blob:
@ -373,8 +367,6 @@ def move():
                return get_data_error_result(message="File or Folder not found!")
            if not file.tenant_id:
                return get_data_error_result(message="Tenant not found!")
-            if file.tenant_id != current_user.id:
-                return get_json_result(data=False, message='No authorization.', code=settings.RetCode.AUTHENTICATION_ERROR)
        fe, _ = FileService.get_by_id(parent_id)
        if not fe:
            return get_data_error_result(message="Parent Folder not found!")
--- a/api/apps/kb_app.py
+++ b/api/apps/kb_app.py
@ -14,21 +14,18 @@
 #  limitations under the License.
 #
 import json
-import logging

 from flask import request
 from flask_login import login_required, current_user

 from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService, queue_raptor_o_graphrag_tasks
+from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
-from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
-from api.db.services.task_service import TaskService, GRAPH_RAPTOR_FAKE_DOC_ID
 from api.db.services.user_service import TenantService, UserTenantService
-from api.utils.api_utils import get_error_data_result, server_error_response, get_data_error_result, validate_request, not_allowed_parameters
+from api.utils.api_utils import server_error_response, get_data_error_result, validate_request, not_allowed_parameters, active_required
 from api.utils import get_uuid
-from api.db import PipelineTaskType, StatusEnum, FileSource, VALID_FILE_TYPES, VALID_TASK_STATUS
+from api.db import StatusEnum, FileSource
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.db_models import File
 from api.utils.api_utils import get_json_result
@ -38,8 +35,10 @@ from api.constants import DATASET_NAME_LIMIT
 from rag.settings import PAGERANK_FLD
 from rag.utils.storage_factory import STORAGE_IMPL

+
@manager.route('/create', methods=['post'])  # noqa: F821
@login_required
+@active_required
@validate_request("name")
 def create():
    req = request.json
@ -63,39 +62,10 @@ def create():
        req["name"] = dataset_name
        req["tenant_id"] = current_user.id
        req["created_by"] = current_user.id
-        if not req.get("parser_id"):
-            req["parser_id"] = "naive"
        e, t = TenantService.get_by_id(current_user.id)
        if not e:
            return get_data_error_result(message="Tenant not found.")
-        req["parser_config"] = {
-            "layout_recognize": "DeepDOC",
-            "chunk_token_num": 512,
-            "delimiter": "\n",
-            "auto_keywords": 0,
-            "auto_questions": 0,
-            "html4excel": False,
-            "topn_tags": 3,
-            "raptor": {
-                "use_raptor": True,
-                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
-                "max_token": 256,
-                "threshold": 0.1,
-                "max_cluster": 64,
-                "random_seed": 0
-            },
-            "graphrag": {
-                "use_graphrag": True,
-                "entity_types": [
-                    "organization",
-                    "person",
-                    "geo",
-                    "event",
-                    "category"
-                ],
-                "method": "light"
-            }
-        }
+        req["embd_id"] = t.embd_id
        if not KnowledgebaseService.save(**req):
            return get_data_error_result()
        return get_json_result(data={"kb_id": req["id"]})
@ -426,352 +396,3 @@ def get_basic_info():
    basic_info = DocumentService.knowledgebase_basic_info(kb_id)

    return get_json_result(data=basic_info)
-
-
-@manager.route("/list_pipeline_logs", methods=["POST"])  # noqa: F821
-@login_required
-def list_pipeline_logs():
-    kb_id = request.args.get("kb_id")
-    if not kb_id:
-        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
-
-    keywords = request.args.get("keywords", "")
-
-    page_number = int(request.args.get("page", 0))
-    items_per_page = int(request.args.get("page_size", 0))
-    orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc", "true").lower() == "false":
-        desc = False
-    else:
-        desc = True
-    create_date_from = request.args.get("create_date_from", "")
-    create_date_to = request.args.get("create_date_to", "")
-    if create_date_to > create_date_from:
-        return get_data_error_result(message="Create data filter is abnormal.")
-
-    req = request.get_json()
-
-    operation_status = req.get("operation_status", [])
-    if operation_status:
-        invalid_status = {s for s in operation_status if s not in VALID_TASK_STATUS}
-        if invalid_status:
-            return get_data_error_result(message=f"Invalid filter operation_status status conditions: {', '.join(invalid_status)}")
-
-    types = req.get("types", [])
-    if types:
-        invalid_types = {t for t in types if t not in VALID_FILE_TYPES}
-        if invalid_types:
-            return get_data_error_result(message=f"Invalid filter conditions: {', '.join(invalid_types)} type{'s' if len(invalid_types) > 1 else ''}")
-
-    suffix = req.get("suffix", [])
-
-    try:
-        logs, tol = PipelineOperationLogService.get_file_logs_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix, create_date_from, create_date_to)
-        return get_json_result(data={"total": tol, "logs": logs})
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/list_pipeline_dataset_logs", methods=["POST"])  # noqa: F821
-@login_required
-def list_pipeline_dataset_logs():
-    kb_id = request.args.get("kb_id")
-    if not kb_id:
-        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
-
-    page_number = int(request.args.get("page", 0))
-    items_per_page = int(request.args.get("page_size", 0))
-    orderby = request.args.get("orderby", "create_time")
-    if request.args.get("desc", "true").lower() == "false":
-        desc = False
-    else:
-        desc = True
-    create_date_from = request.args.get("create_date_from", "")
-    create_date_to = request.args.get("create_date_to", "")
-    if create_date_to > create_date_from:
-        return get_data_error_result(message="Create data filter is abnormal.")
-
-    req = request.get_json()
-
-    operation_status = req.get("operation_status", [])
-    if operation_status:
-        invalid_status = {s for s in operation_status if s not in VALID_TASK_STATUS}
-        if invalid_status:
-            return get_data_error_result(message=f"Invalid filter operation_status status conditions: {', '.join(invalid_status)}")
-
-    try:
-        logs, tol = PipelineOperationLogService.get_dataset_logs_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, operation_status, create_date_from, create_date_to)
-        return get_json_result(data={"total": tol, "logs": logs})
-    except Exception as e:
-        return server_error_response(e)
-
-
-@manager.route("/delete_pipeline_logs", methods=["POST"])  # noqa: F821
-@login_required
-def delete_pipeline_logs():
-    kb_id = request.args.get("kb_id")
-    if not kb_id:
-        return get_json_result(data=False, message='Lack of "KB ID"', code=settings.RetCode.ARGUMENT_ERROR)
-
-    req = request.get_json()
-    log_ids = req.get("log_ids", [])
-
-    PipelineOperationLogService.delete_by_ids(log_ids)
-
-    return get_json_result(data=True)
-
-
-@manager.route("/pipeline_log_detail", methods=["GET"])  # noqa: F821
-@login_required
-def pipeline_log_detail():
-    log_id = request.args.get("log_id")
-    if not log_id:
-        return get_json_result(data=False, message='Lack of "Pipeline log ID"', code=settings.RetCode.ARGUMENT_ERROR)
-
-    ok, log = PipelineOperationLogService.get_by_id(log_id)
-    if not ok:
-        return get_data_error_result(message="Invalid pipeline log ID")
-
-    return get_json_result(data=log.to_dict())
-
-
-@manager.route("/run_graphrag", methods=["POST"])  # noqa: F821
-@login_required
-def run_graphrag():
-    req = request.json
-
-    kb_id = req.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.graphrag_task_id
-    if task_id:
-        ok, task = TaskService.get_by_id(task_id)
-        if not ok:
-            logging.warning(f"A valid GraphRAG task id is expected for kb {kb_id}")
-
-        if task and task.progress not in [-1, 1]:
-            return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A Graph Task is already running.")
-
-    documents, _ = DocumentService.get_by_kb_id(
-        kb_id=kb_id,
-        page_number=0,
-        items_per_page=0,
-        orderby="create_time",
-        desc=False,
-        keywords="",
-        run_status=[],
-        types=[],
-        suffix=[],
-    )
-    if not documents:
-        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
-
-    sample_document = documents[0]
-    document_ids = [document["id"] for document in documents]
-
-    task_id = queue_raptor_o_graphrag_tasks(doc=sample_document, ty="graphrag", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))
-
-    if not KnowledgebaseService.update_by_id(kb.id, {"graphrag_task_id": task_id}):
-        logging.warning(f"Cannot save graphrag_task_id for kb {kb_id}")
-
-    return get_json_result(data={"graphrag_task_id": task_id})
-
-
-@manager.route("/trace_graphrag", methods=["GET"])  # noqa: F821
-@login_required
-def trace_graphrag():
-    kb_id = request.args.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.graphrag_task_id
-    if not task_id:
-        return get_json_result(data={})
-
-    ok, task = TaskService.get_by_id(task_id)
-    if not ok:
-        return get_error_data_result(message="GraphRAG Task Not Found or Error Occurred")
-
-    return get_json_result(data=task.to_dict())
-
-
-@manager.route("/run_raptor", methods=["POST"])  # noqa: F821
-@login_required
-def run_raptor():
-    req = request.json
-
-    kb_id = req.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.raptor_task_id
-    if task_id:
-        ok, task = TaskService.get_by_id(task_id)
-        if not ok:
-            logging.warning(f"A valid RAPTOR task id is expected for kb {kb_id}")
-
-        if task and task.progress not in [-1, 1]:
-            return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A RAPTOR Task is already running.")
-
-    documents, _ = DocumentService.get_by_kb_id(
-        kb_id=kb_id,
-        page_number=0,
-        items_per_page=0,
-        orderby="create_time",
-        desc=False,
-        keywords="",
-        run_status=[],
-        types=[],
-        suffix=[],
-    )
-    if not documents:
-        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
-
-    sample_document = documents[0]
-    document_ids = [document["id"] for document in documents]
-
-    task_id = queue_raptor_o_graphrag_tasks(doc=sample_document, ty="raptor", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))
-
-    if not KnowledgebaseService.update_by_id(kb.id, {"raptor_task_id": task_id}):
-        logging.warning(f"Cannot save raptor_task_id for kb {kb_id}")
-
-    return get_json_result(data={"raptor_task_id": task_id})
-
-
-@manager.route("/trace_raptor", methods=["GET"])  # noqa: F821
-@login_required
-def trace_raptor():
-    kb_id = request.args.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.raptor_task_id
-    if not task_id:
-        return get_json_result(data={})
-
-    ok, task = TaskService.get_by_id(task_id)
-    if not ok:
-        return get_error_data_result(message="RAPTOR Task Not Found or Error Occurred")
-
-    return get_json_result(data=task.to_dict())
-
-
-@manager.route("/run_mindmap", methods=["POST"])  # noqa: F821
-@login_required
-def run_mindmap():
-    req = request.json
-
-    kb_id = req.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.mindmap_task_id
-    if task_id:
-        ok, task = TaskService.get_by_id(task_id)
-        if not ok:
-            logging.warning(f"A valid Mindmap task id is expected for kb {kb_id}")
-
-        if task and task.progress not in [-1, 1]:
-            return get_error_data_result(message=f"Task {task_id} in progress with status {task.progress}. A Mindmap Task is already running.")
-
-    documents, _ = DocumentService.get_by_kb_id(
-        kb_id=kb_id,
-        page_number=0,
-        items_per_page=0,
-        orderby="create_time",
-        desc=False,
-        keywords="",
-        run_status=[],
-        types=[],
-        suffix=[],
-    )
-    if not documents:
-        return get_error_data_result(message=f"No documents in Knowledgebase {kb_id}")
-
-    sample_document = documents[0]
-    document_ids = [document["id"] for document in documents]
-
-    task_id = queue_raptor_o_graphrag_tasks(doc=sample_document, ty="mindmap", priority=0, fake_doc_id=GRAPH_RAPTOR_FAKE_DOC_ID, doc_ids=list(document_ids))
-
-    if not KnowledgebaseService.update_by_id(kb.id, {"mindmap_task_id": task_id}):
-        logging.warning(f"Cannot save mindmap_task_id for kb {kb_id}")
-
-    return get_json_result(data={"mindmap_task_id": task_id})
-
-
-@manager.route("/trace_mindmap", methods=["GET"])  # noqa: F821
-@login_required
-def trace_mindmap():
-    kb_id = request.args.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_error_data_result(message="Invalid Knowledgebase ID")
-
-    task_id = kb.mindmap_task_id
-    if not task_id:
-        return get_json_result(data={})
-
-    ok, task = TaskService.get_by_id(task_id)
-    if not ok:
-        return get_error_data_result(message="Mindmap Task Not Found or Error Occurred")
-
-    return get_json_result(data=task.to_dict())
-
-
-@manager.route("/unbind_task", methods=["DELETE"])  # noqa: F821
-@login_required
-def delete_kb_task():
-    kb_id = request.args.get("kb_id", "")
-    if not kb_id:
-        return get_error_data_result(message='Lack of "KB ID"')
-    ok, kb = KnowledgebaseService.get_by_id(kb_id)
-    if not ok:
-        return get_json_result(data=True)
-
-    pipeline_task_type = request.args.get("pipeline_task_type", "")
-    if not pipeline_task_type or pipeline_task_type not in [PipelineTaskType.GRAPH_RAG, PipelineTaskType.RAPTOR, PipelineTaskType.MINDMAP]:
-        return get_error_data_result(message="Invalid task type")
-
-    match pipeline_task_type:
-        case PipelineTaskType.GRAPH_RAG:
-            settings.docStoreConn.delete({"knowledge_graph_kwd": ["graph", "subgraph", "entity", "relation"]}, search.index_name(kb.tenant_id), kb_id)
-            kb_task_id = "graphrag_task_id"
-            kb_task_finish_at = "graphrag_task_finish_at"
-        case PipelineTaskType.RAPTOR:
-            kb_task_id = "raptor_task_id"
-            kb_task_finish_at = "raptor_task_finish_at"
-        case PipelineTaskType.MINDMAP:
-            kb_task_id = "mindmap_task_id"
-            kb_task_finish_at = "mindmap_task_finish_at"
-        case _:
-            return get_error_data_result(message="Internal Error: Invalid task type")
-
-    ok = KnowledgebaseService.update_by_id(kb_id, {kb_task_id: "", kb_task_finish_at: None})
-    if not ok:
-        return server_error_response(f"Internal error: cannot delete task {pipeline_task_type}")
-
-    return get_json_result(data=True)
--- a/api/apps/system_app.py
+++ b/api/apps/system_app.py
@ -39,7 +39,6 @@ from rag.utils.redis_conn import REDIS_CONN
 from flask import jsonify
 from api.utils.health_utils import run_health_checks

-
@manager.route("/version", methods=["GET"])  # noqa: F821
@login_required
 def version():
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@ -98,14 +98,7 @@ def login():
        return get_json_result(data=False, code=settings.RetCode.SERVER_ERROR, message="Fail to crypt password")

    user = UserService.query_user(email, password)
-
-    if user and hasattr(user, 'is_active') and user.is_active == "0":
-        return get_json_result(
-            data=False,
-            code=settings.RetCode.FORBIDDEN,
-            message="This account has been disabled, please contact the administrator!",
-        )
-    elif user:
+    if user:
        response_data = user.to_json()
        user.access_token = get_uuid()
        login_user(user)
@ -234,9 +227,6 @@ def oauth_callback(channel):
        # User exists, try to log in
        user = users[0]
        user.access_token = get_uuid()
-        if user and hasattr(user, 'is_active') and user.is_active == "0":
-            return redirect("/?error=user_inactive")
-
        login_user(user)
        user.save()
        return redirect(f"/?auth={user.get_id()}")
@ -327,8 +317,6 @@ def github_callback():
    # User has already registered, try to log in
    user = users[0]
    user.access_token = get_uuid()
-    if user and hasattr(user, 'is_active') and user.is_active == "0":
-        return redirect("/?error=user_inactive")
    login_user(user)
    user.save()
    return redirect("/?auth=%s" % user.get_id())
@ -430,8 +418,6 @@ def feishu_callback():

    # User has already registered, try to log in
    user = users[0]
-    if user and hasattr(user, 'is_active') and user.is_active == "0":
-        return redirect("/?error=user_inactive")
    user.access_token = get_uuid()
    login_user(user)
    user.save()
--- a/api/common/README.md
+++ b/api/common/README.md
@ -1,2 +0,0 @@
-The python files in this directory are shared between service. They contain common utilities, models, and functions that can be used across various
-services to ensure consistency and reduce code duplication.
--- a/api/common/base64.py
+++ b/api/common/base64.py
@ -1,21 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-
-import base64
-
-def encode_to_base64(input_string):
-    base64_encoded = base64.b64encode(input_string.encode('utf-8'))
-    return base64_encoded.decode('utf-8')
--- a/api/db/init.py
+++ b/api/db/init.py
@ -127,15 +127,4 @@ class MCPServerType(StrEnum):
 VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}


-class PipelineTaskType(StrEnum):
-    PARSE = "Parse"
-    DOWNLOAD = "Download"
-    RAPTOR = "RAPTOR"
-    GRAPH_RAG = "GraphRAG"
-    MINDMAP = "Mindmap"
-
-
-VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
-
-
 KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@ -684,17 +684,8 @@ class Knowledgebase(DataBaseModel):
    vector_similarity_weight = FloatField(default=0.3, index=True)

    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", default=ParserType.NAIVE.value, index=True)
-    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    pagerank = IntegerField(default=0, index=False)
-
-    graphrag_task_id = CharField(max_length=32, null=True, help_text="Graph RAG task ID", index=True)
-    graphrag_task_finish_at = DateTimeField(null=True)
-    raptor_task_id = CharField(max_length=32, null=True, help_text="RAPTOR task ID", index=True)
-    raptor_task_finish_at = DateTimeField(null=True)
-    mindmap_task_id = CharField(max_length=32, null=True, help_text="Mindmap task ID", index=True)
-    mindmap_task_finish_at = DateTimeField(null=True)
-
    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)

    def __str__(self):
@ -709,7 +700,6 @@ class Document(DataBaseModel):
    thumbnail = TextField(null=True, help_text="thumbnail base64 string")
    kb_id = CharField(max_length=256, null=False, index=True)
    parser_id = CharField(max_length=32, null=False, help_text="default parser ID", index=True)
-    pipeline_id = CharField(max_length=32, null=True, help_text="pipleline ID", index=True)
    parser_config = JSONField(null=False, default={"pages": [[1, 1000000]]})
    source_type = CharField(max_length=128, null=False, default="local", help_text="where dose this document come from", index=True)
    type = CharField(max_length=32, null=False, help_text="file extension", index=True)
@ -952,32 +942,6 @@ class Search(DataBaseModel):
        db_table = "search"


-class PipelineOperationLog(DataBaseModel):
-    id = CharField(max_length=32, primary_key=True)
-    document_id = CharField(max_length=32, index=True)
-    tenant_id = CharField(max_length=32, null=False, index=True)
-    kb_id = CharField(max_length=32, null=False, index=True)
-    pipeline_id = CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)
-    pipeline_title = CharField(max_length=32, null=True, help_text="Pipeline title", index=True)
-    parser_id = CharField(max_length=32, null=False, help_text="Parser ID", index=True)
-    document_name = CharField(max_length=255, null=False, help_text="File name")
-    document_suffix = CharField(max_length=255, null=False, help_text="File suffix")
-    document_type = CharField(max_length=255, null=False, help_text="Document type")
-    source_from = CharField(max_length=255, null=False, help_text="Source")
-    progress = FloatField(default=0, index=True)
-    progress_msg = TextField(null=True, help_text="process message", default="")
-    process_begin_at = DateTimeField(null=True, index=True)
-    process_duration = FloatField(default=0)
-    dsl = JSONField(null=True, default=dict)
-    task_type = CharField(max_length=32, null=False, default="")
-    operation_status = CharField(max_length=32, null=False, help_text="Operation status")
-    avatar = TextField(null=True, help_text="avatar base64 string")
-    status = CharField(max_length=1, null=True, help_text="is it validate(0: wasted, 1: validate)", default="1", index=True)
-
-    class Meta:
-        db_table = "pipeline_operation_log"
-
-
 def migrate_db():
    logging.disable(logging.ERROR)
    migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
@ -1094,6 +1058,7 @@ def migrate_db():
        migrate(migrator.add_column("dialog", "meta_data_filter", JSONField(null=True, default={})))
    except Exception:
        pass
+
    try:
        migrate(migrator.alter_column_type("canvas_template", "title", JSONField(null=True, default=dict, help_text="Canvas title")))
    except Exception:
@ -1110,36 +1075,4 @@ def migrate_db():
        migrate(migrator.add_column("canvas_template", "canvas_category", CharField(max_length=32, null=False, default="agent_canvas", help_text="agent_canvas|dataflow_canvas", index=True)))
    except Exception:
        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "pipeline_id", CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("document", "pipeline_id", CharField(max_length=32, null=True, help_text="Pipeline ID", index=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "graphrag_task_id", CharField(max_length=32, null=True, help_text="Gragh RAG task ID", index=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "raptor_task_id", CharField(max_length=32, null=True, help_text="RAPTOR task ID", index=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "graphrag_task_finish_at", DateTimeField(null=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "raptor_task_finish_at", CharField(null=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "mindmap_task_id", CharField(max_length=32, null=True, help_text="Mindmap task ID", index=True)))
-    except Exception:
-        pass
-    try:
-        migrate(migrator.add_column("knowledgebase", "mindmap_task_finish_at", CharField(null=True)))
-    except Exception:
-        pass
    logging.disable(logging.NOTSET)
--- a/api/db/init_data.py
+++ b/api/db/init_data.py
@ -14,6 +14,7 @@
 #  limitations under the License.
 #
 import logging
+import base64
 import json
 import os
 import time
@ -31,7 +32,11 @@ from api.db.services.llm_service import LLMService, LLMBundle, get_init_tenant_l
 from api.db.services.user_service import TenantService, UserTenantService
 from api import settings
 from api.utils.file_utils import get_project_base_directory
-from api.common.base64 import encode_to_base64
+
+
+def encode_to_base64(input_string):
+    base64_encoded = base64.b64encode(input_string.encode('utf-8'))
+    return base64_encoded.decode('utf-8')


 def init_superuser():
--- a/api/db/joint_services/user_account_service.py
+++ b/api/db/joint_services/user_account_service.py
@ -17,26 +17,13 @@ import logging
 import uuid

 from api import settings
-from api.utils.api_utils import group_by
-from api.db import FileType, UserTenantRole, ActiveEnum
-from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.canvas_service import UserCanvasService
-from api.db.services.conversation_service import ConversationService
-from api.db.services.dialog_service import DialogService
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.langfuse_service import TenantLangfuseService
+from api.db import FileType, UserTenantRole
+from api.db.db_models import TenantLLM
 from api.db.services.llm_service import get_init_tenant_llm
 from api.db.services.file_service import FileService
-from api.db.services.mcp_server_service import MCPServerService
-from api.db.services.search_service import SearchService
-from api.db.services.task_service import TaskService
 from api.db.services.tenant_llm_service import TenantLLMService
-from api.db.services.user_canvas_version import UserCanvasVersionService
 from api.db.services.user_service import TenantService, UserService, UserTenantService
-from rag.utils.storage_factory import STORAGE_IMPL
-from rag.nlp import search
+


 def create_new_user(user_info: dict) -> dict:
@ -117,7 +104,7 @@ def create_new_user(user_info: dict) -> dict:
        except Exception as e:
            logging.exception(e)
        try:
-            TenantLLMService.delete_by_tenant_id(user_id)
+            TenantLLM.delete().where(TenantLLM.tenant_id == user_id).execute()
        except Exception as e:
            logging.exception(e)
        try:
@ -131,197 +118,3 @@ def create_new_user(user_info: dict) -> dict:
            logging.exception(e)
        # reraise
        raise create_error
-
-
-def delete_user_data(user_id: str) -> dict:
-    # use user_id to delete
-    usr = UserService.filter_by_id(user_id)
-    if not usr:
-        return {"success": False, "message": f"{user_id} can't be found."}
-    # check is inactive and not admin
-    if usr.is_active == ActiveEnum.ACTIVE.value:
-        return {"success": False, "message": f"{user_id} is active and can't be deleted."}
-    if usr.is_superuser:
-        return {"success": False, "message": "Can't delete the super user."}
-    # tenant info
-    tenants = UserTenantService.get_user_tenant_relation_by_user_id(usr.id)
-    owned_tenant = [t for t in tenants if t["role"] == UserTenantRole.OWNER.value]
-
-    done_msg = ''
-    try:
-        # step1. delete owned tenant info
-        if owned_tenant:
-            done_msg += "Start to delete owned tenant.\n"
-            tenant_id = owned_tenant[0]["tenant_id"]
-            kb_ids = KnowledgebaseService.get_kb_ids(usr.id)
-            # step1.1 delete knowledgebase related file and info
-            if kb_ids:
-                # step1.1.1 delete files in storage, remove bucket
-                for kb_id in kb_ids:
-                    if STORAGE_IMPL.bucket_exists(kb_id):
-                        STORAGE_IMPL.remove_bucket(kb_id)
-                done_msg += f"- Removed {len(kb_ids)} dataset's buckets.\n"
-                # step1.1.2 delete file and document info in db
-                doc_ids = DocumentService.get_all_doc_ids_by_kb_ids(kb_ids)
-                if doc_ids:
-                    doc_delete_res = DocumentService.delete_by_ids([i["id"] for i in doc_ids])
-                    done_msg += f"- Deleted {doc_delete_res} document records.\n"
-                    task_delete_res = TaskService.delete_by_doc_ids([i["id"] for i in doc_ids])
-                    done_msg += f"- Deleted {task_delete_res} task records.\n"
-                file_ids = FileService.get_all_file_ids_by_tenant_id(usr.id)
-                if file_ids:
-                    file_delete_res = FileService.delete_by_ids([f["id"] for f in file_ids])
-                    done_msg += f"- Deleted {file_delete_res} file records.\n"
-                if doc_ids or file_ids:
-                    file2doc_delete_res = File2DocumentService.delete_by_document_ids_or_file_ids(
-                        [i["id"] for i in doc_ids],
-                        [f["id"] for f in file_ids]
-                    )
-                    done_msg += f"- Deleted {file2doc_delete_res} document-file relation records.\n"
-                # step1.1.3 delete chunk in es
-                r = settings.docStoreConn.delete({"kb_id": kb_ids},
-                                         search.index_name(tenant_id), kb_ids)
-                done_msg += f"- Deleted {r} chunk records.\n"
-                kb_delete_res = KnowledgebaseService.delete_by_ids(kb_ids)
-                done_msg += f"- Deleted {kb_delete_res} knowledgebase records.\n"
-                # step1.1.4 delete agents
-                agent_delete_res = delete_user_agents(usr.id)
-                done_msg += f"- Deleted {agent_delete_res['agents_deleted_count']} agent, {agent_delete_res['version_deleted_count']} versions records.\n"
-                # step1.1.5 delete dialogs
-                dialog_delete_res = delete_user_dialogs(usr.id)
-                done_msg += f"- Deleted {dialog_delete_res['dialogs_deleted_count']} dialogs, {dialog_delete_res['conversations_deleted_count']} conversations, {dialog_delete_res['api_token_deleted_count']} api tokens, {dialog_delete_res['api4conversation_deleted_count']} api4conversations.\n"
-                # step1.1.6 delete mcp server
-                mcp_delete_res = MCPServerService.delete_by_tenant_id(usr.id)
-                done_msg += f"- Deleted {mcp_delete_res} MCP server.\n"
-                # step1.1.7 delete search
-                search_delete_res = SearchService.delete_by_tenant_id(usr.id)
-                done_msg += f"- Deleted {search_delete_res} search records.\n"
-            # step1.2 delete tenant_llm and tenant_langfuse
-            llm_delete_res = TenantLLMService.delete_by_tenant_id(tenant_id)
-            done_msg += f"- Deleted {llm_delete_res} tenant-LLM records.\n"
-            langfuse_delete_res = TenantLangfuseService.delete_ty_tenant_id(tenant_id)
-            done_msg += f"- Deleted {langfuse_delete_res} langfuse records.\n"
-            # step1.3 delete own tenant
-            tenant_delete_res = TenantService.delete_by_id(tenant_id)
-            done_msg += f"- Deleted {tenant_delete_res} tenant.\n"
-        # step2 delete user-tenant relation
-        if tenants:
-            # step2.1 delete docs and files in joined team
-            joined_tenants = [t for t in tenants if t["role"] == UserTenantRole.NORMAL.value]
-            if joined_tenants:
-                done_msg += "Start to delete data in joined tenants.\n"
-                created_documents = DocumentService.get_all_docs_by_creator_id(usr.id)
-                if created_documents:
-                    # step2.1.1 delete files
-                    doc_file_info = File2DocumentService.get_by_document_ids([d['id'] for d in created_documents])
-                    created_files = FileService.get_by_ids([f['file_id'] for f in doc_file_info])
-                    if created_files:
-                        # step2.1.1.1 delete file in storage
-                        for f in created_files:
-                            STORAGE_IMPL.rm(f.parent_id, f.location)
-                        done_msg += f"- Deleted {len(created_files)} uploaded file.\n"
-                        # step2.1.1.2 delete file record
-                        file_delete_res = FileService.delete_by_ids([f.id for f in created_files])
-                        done_msg += f"- Deleted {file_delete_res} file records.\n"
-                    # step2.1.2 delete document-file relation record
-                    file2doc_delete_res = File2DocumentService.delete_by_document_ids_or_file_ids(
-                        [d['id'] for d in created_documents],
-                        [f.id for f in created_files]
-                    )
-                    done_msg += f"- Deleted {file2doc_delete_res} document-file relation records.\n"
-                    # step2.1.3 delete chunks
-                    doc_groups = group_by(created_documents, "tenant_id")
-                    kb_grouped_doc = {k: group_by(v, "kb_id") for k, v in doc_groups.items()}
-                    # chunks in {'tenant_id': {'kb_id': [{'id': doc_id}]}} structure
-                    chunk_delete_res = 0
-                    kb_doc_info = {}
-                    for _tenant_id, kb_doc in kb_grouped_doc.items():
-                        for _kb_id, docs in kb_doc.items():
-                            chunk_delete_res += settings.docStoreConn.delete(
-                                {"doc_id": [d["id"] for d in docs]},
-                                search.index_name(_tenant_id), _kb_id
-                            )
-                            # record doc info
-                            if _kb_id in kb_doc_info.keys():
-                                kb_doc_info[_kb_id]['doc_num'] += 1
-                                kb_doc_info[_kb_id]['token_num'] += sum([d["token_num"] for d in docs])
-                                kb_doc_info[_kb_id]['chunk_num'] += sum([d["chunk_num"] for d in docs])
-                            else:
-                                kb_doc_info[_kb_id] = {
-                                    'doc_num': 1,
-                                    'token_num': sum([d["token_num"] for d in docs]),
-                                    'chunk_num': sum([d["chunk_num"] for d in docs])
-                                }
-                    done_msg += f"- Deleted {chunk_delete_res} chunks.\n"
-                    # step2.1.4 delete tasks
-                    task_delete_res = TaskService.delete_by_doc_ids([d['id'] for d in created_documents])
-                    done_msg += f"- Deleted {task_delete_res} tasks.\n"
-                    # step2.1.5 delete document record
-                    doc_delete_res = DocumentService.delete_by_ids([d['id'] for d in created_documents])
-                    done_msg += f"- Deleted {doc_delete_res} documents.\n"
-                    # step2.1.6 update knowledge base doc&chunk&token cnt
-                    for kb_id, doc_num in kb_doc_info.items():
-                        KnowledgebaseService.decrease_document_num_in_delete(kb_id, doc_num)
-
-            # step2.2 delete relation
-            user_tenant_delete_res = UserTenantService.delete_by_ids([t["id"] for t in tenants])
-            done_msg += f"- Deleted {user_tenant_delete_res} user-tenant records.\n"
-        # step3 finally delete user
-        user_delete_res = UserService.delete_by_id(usr.id)
-        done_msg += f"- Deleted {user_delete_res} user.\nDelete done!"
-
-        return {"success": True, "message": f"Successfully deleted user. Details:\n{done_msg}"}
-
-    except Exception as e:
-        logging.exception(e)
-        return {"success": False, "message": f"Error: {str(e)}. Already done:\n{done_msg}"}
-
-
-def delete_user_agents(user_id: str) -> dict:
-    """
-    use user_id to delete
-    :return: {
-        "agents_deleted_count": 1,
-        "version_deleted_count": 2
-    }
-    """
-    agents_deleted_count, agents_version_deleted_count = 0, 0
-    user_agents = UserCanvasService.get_all_agents_by_tenant_ids([user_id], user_id)
-    if user_agents:
-        agents_version = UserCanvasVersionService.get_all_canvas_version_by_canvas_ids([a['id'] for a in user_agents])
-        agents_version_deleted_count = UserCanvasVersionService.delete_by_ids([v['id'] for v in agents_version])
-        agents_deleted_count = UserCanvasService.delete_by_ids([a['id'] for a in user_agents])
-    return {
-        "agents_deleted_count": agents_deleted_count,
-        "version_deleted_count": agents_version_deleted_count
-    }
-
-
-def delete_user_dialogs(user_id: str) -> dict:
-    """
-    use user_id to delete
-    :return: {
-        "dialogs_deleted_count": 1,
-        "conversations_deleted_count": 1,
-        "api_token_deleted_count": 2,
-        "api4conversation_deleted_count": 2
-    }
-    """
-    dialog_deleted_count, conversations_deleted_count, api_token_deleted_count, api4conversation_deleted_count = 0, 0, 0, 0
-    user_dialogs = DialogService.get_all_dialogs_by_tenant_id(user_id)
-    if user_dialogs:
-        # delete conversation
-        conversations = ConversationService.get_all_conversation_by_dialog_ids([ud['id'] for ud in user_dialogs])
-        conversations_deleted_count = ConversationService.delete_by_ids([c['id'] for c in conversations])
-        # delete api token
-        api_token_deleted_count = APITokenService.delete_by_tenant_id(user_id)
-        # delete api for conversation
-        api4conversation_deleted_count = API4ConversationService.delete_by_dialog_ids([ud['id'] for ud in user_dialogs])
-        # delete dialog at last
-        dialog_deleted_count = DialogService.delete_by_ids([ud['id'] for ud in user_dialogs])
-    return {
-        "dialogs_deleted_count": dialog_deleted_count,
-        "conversations_deleted_count": conversations_deleted_count,
-        "api_token_deleted_count": api_token_deleted_count,
-        "api4conversation_deleted_count": api4conversation_deleted_count
-    }
--- a/api/db/services/api_service.py
+++ b/api/db/services/api_service.py
@ -35,11 +35,6 @@ class APITokenService(CommonService):
            cls.model.token == token
        )

-    @classmethod
-    @DB.connection_context()
-    def delete_by_tenant_id(cls, tenant_id):
-        return cls.model.delete().where(cls.model.tenant_id == tenant_id).execute()
-

 class API4ConversationService(CommonService):
    model = API4Conversation
@ -105,8 +100,3 @@ class API4ConversationService(CommonService):
            cls.model.create_date <= to_date,
            cls.model.source == source
        ).group_by(cls.model.create_date.truncate("day")).dicts()
-
-    @classmethod
-    @DB.connection_context()
-    def delete_by_dialog_ids(cls, dialog_ids):
-        return cls.model.delete().where(cls.model.dialog_id.in_(dialog_ids)).execute()
--- a/api/db/services/canvas_service.py
+++ b/api/db/services/canvas_service.py
@ -66,7 +66,6 @@ class UserCanvasService(CommonService):
    def get_all_agents_by_tenant_ids(cls, tenant_ids, user_id):
        # will get all permitted agents, be cautious
        fields = [
-            cls.model.id,
            cls.model.title,
            cls.model.permission,
            cls.model.canvas_type,
@ -94,7 +93,7 @@ class UserCanvasService(CommonService):

    @classmethod
    @DB.connection_context()
-    def get_by_canvas_id(cls, pid):
+    def get_by_tenant_id(cls, pid):
        try:

            fields = [
@ -126,7 +125,7 @@ class UserCanvasService(CommonService):
    @DB.connection_context()
    def get_by_tenant_ids(cls, joined_tenant_ids, user_id,
                          page_number, items_per_page,
-                          orderby, desc, keywords, canvas_category=None
+                          orderby, desc, keywords, canvas_category=CanvasCategory.Agent,
                          ):
        fields = [
            cls.model.id,
@ -135,7 +134,6 @@ class UserCanvasService(CommonService):
            cls.model.dsl,
            cls.model.description,
            cls.model.permission,
-            cls.model.user_id.alias("tenant_id"),
            User.nickname,
            User.avatar.alias('tenant_avatar'),
            cls.model.update_time,
@ -143,33 +141,31 @@ class UserCanvasService(CommonService):
        ]
        if keywords:
            agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
-                cls.model.user_id.in_(joined_tenant_ids),
-                fn.LOWER(cls.model.title).contains(keywords.lower())
-                #(((cls.model.user_id.in_(joined_tenant_ids)) & (cls.model.permission == TenantPermission.TEAM.value)) | (cls.model.user_id == user_id)),
-                #(fn.LOWER(cls.model.title).contains(keywords.lower()))
+                ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                                TenantPermission.TEAM.value)) | (
+                    cls.model.user_id == user_id)),
+                (fn.LOWER(cls.model.title).contains(keywords.lower()))
            )
        else:
            agents = cls.model.select(*fields).join(User, on=(cls.model.user_id == User.id)).where(
-                cls.model.user_id.in_(joined_tenant_ids)
-                #(((cls.model.user_id.in_(joined_tenant_ids)) & (cls.model.permission == TenantPermission.TEAM.value)) | (cls.model.user_id == user_id))
+                ((cls.model.user_id.in_(joined_tenant_ids) & (cls.model.permission ==
+                                                                TenantPermission.TEAM.value)) | (
+                    cls.model.user_id == user_id))
            )
-        if canvas_category:
-            agents = agents.where(cls.model.canvas_category == canvas_category)
+        agents = agents.where(cls.model.canvas_category == canvas_category)
        if desc:
            agents = agents.order_by(cls.model.getter_by(orderby).desc())
        else:
            agents = agents.order_by(cls.model.getter_by(orderby).asc())
-
        count = agents.count()
-        if page_number and items_per_page:
-            agents = agents.paginate(page_number, items_per_page)
+        agents = agents.paginate(page_number, items_per_page)
        return list(agents.dicts()), count

    @classmethod
    @DB.connection_context()
    def accessible(cls, canvas_id, tenant_id):
        from api.db.services.user_service import UserTenantService
-        e, c = UserCanvasService.get_by_canvas_id(canvas_id)
+        e, c = UserCanvasService.get_by_tenant_id(canvas_id)
        if not e:
            return False

--- a/api/db/services/conversation_service.py
+++ b/api/db/services/conversation_service.py
@ -48,21 +48,6 @@ class ConversationService(CommonService):

        return list(sessions.dicts())

-    @classmethod
-    @DB.connection_context()
-    def get_all_conversation_by_dialog_ids(cls, dialog_ids):
-        sessions = cls.model.select().where(cls.model.dialog_id.in_(dialog_ids))
-        sessions.order_by(cls.model.create_time.asc())
-        offset, limit = 0, 100
-        res = []
-        while True:
-            s_batch = sessions.offset(offset).limit(limit)
-            _temp = list(s_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res

 def structure_answer(conv, ans, message_id, session_id):
    reference = ans["reference"]
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@ -159,22 +159,6 @@ class DialogService(CommonService):

        return list(dialogs.dicts()), count

-    @classmethod
-    @DB.connection_context()
-    def get_all_dialogs_by_tenant_id(cls, tenant_id):
-        fields = [cls.model.id]
-        dialogs = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
-        dialogs.order_by(cls.model.create_time.asc())
-        offset, limit = 0, 100
-        res = []
-        while True:
-            d_batch = dialogs.offset(offset).limit(limit)
-            _temp = list(d_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res

 def chat_solo(dialog, messages, stream=True):
    if TenantLLMService.llm_id2llm_type(dialog.llm_id) == "image2text":
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@ -24,13 +24,12 @@ from io import BytesIO

 import trio
 import xxhash
-from peewee import fn, Case, JOIN
+from peewee import fn, Case

 from api import settings
 from api.constants import IMG_BASE64_PREFIX, FILE_NAME_LEN_LIMIT
-from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole, CanvasCategory
-from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File, UserCanvas, \
-    User
+from api.db import FileType, LLMType, ParserType, StatusEnum, TaskStatus, UserTenantRole
+from api.db.db_models import DB, Document, Knowledgebase, Task, Tenant, UserTenant, File2Document, File
 from api.db.db_utils import bulk_insert_into_db
 from api.db.services.common_service import CommonService
 from api.db.services.knowledgebase_service import KnowledgebaseService
@ -52,7 +51,6 @@ class DocumentService(CommonService):
            cls.model.thumbnail,
            cls.model.kb_id,
            cls.model.parser_id,
-            cls.model.pipeline_id,
            cls.model.parser_config,
            cls.model.source_type,
            cls.model.type,
@ -81,10 +79,7 @@ class DocumentService(CommonService):
    def get_list(cls, kb_id, page_number, items_per_page,
                 orderby, desc, keywords, id, name):
        fields = cls.get_cls_model_fields()
-        docs = cls.model.select(*[*fields, UserCanvas.title]).join(File2Document, on = (File2Document.document_id == cls.model.id))\
-            .join(File, on = (File.id == File2Document.file_id))\
-            .join(UserCanvas, on = ((cls.model.pipeline_id == UserCanvas.id) & (UserCanvas.canvas_category == CanvasCategory.DataFlow.value)), join_type=JOIN.LEFT_OUTER)\
-            .where(cls.model.kb_id == kb_id)
+        docs = cls.model.select(*fields).join(File2Document, on = (File2Document.document_id == cls.model.id)).join(File, on = (File.id == File2Document.file_id)).where(cls.model.kb_id == kb_id)
        if id:
            docs = docs.where(
                cls.model.id == id)
@ -122,22 +117,12 @@ class DocumentService(CommonService):
                     orderby, desc, keywords, run_status, types, suffix):
        fields = cls.get_cls_model_fields()
        if keywords:
-            docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
-                .join(File2Document, on=(File2Document.document_id == cls.model.id))\
-                .join(File, on=(File.id == File2Document.file_id))\
-                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
-                .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
-                .where(
-                    (cls.model.kb_id == kb_id),
-                    (fn.LOWER(cls.model.name).contains(keywords.lower()))
-                )
+            docs = cls.model.select(*fields).join(File2Document, on=(File2Document.document_id == cls.model.id)).join(File, on=(File.id == File2Document.file_id)).where(
+                (cls.model.kb_id == kb_id),
+                (fn.LOWER(cls.model.name).contains(keywords.lower()))
+            )
        else:
-            docs = cls.model.select(*[*fields, UserCanvas.title.alias("pipeline_name"), User.nickname])\
-                .join(File2Document, on=(File2Document.document_id == cls.model.id))\
-                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
-                .join(File, on=(File.id == File2Document.file_id))\
-                .join(User, on=(cls.model.created_by == User.id), join_type=JOIN.LEFT_OUTER)\
-                .where(cls.model.kb_id == kb_id)
+            docs = cls.model.select(*fields).join(File2Document, on=(File2Document.document_id == cls.model.id)).join(File, on=(File.id == File2Document.file_id)).where(cls.model.kb_id == kb_id)

        if run_status:
            docs = docs.where(cls.model.run.in_(run_status))
@ -243,46 +228,6 @@ class DocumentService(CommonService):

        return int(query.scalar()) or 0

-    @classmethod
-    @DB.connection_context()
-    def get_all_doc_ids_by_kb_ids(cls, kb_ids):
-        fields = [cls.model.id]
-        docs = cls.model.select(*fields).where(cls.model.kb_id.in_(kb_ids))
-        docs.order_by(cls.model.create_time.asc())
-        # maybe cause slow query by deep paginate, optimize later
-        offset, limit = 0, 100
-        res = []
-        while True:
-            doc_batch = docs.offset(offset).limit(limit)
-            _temp = list(doc_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res
-
-    @classmethod
-    @DB.connection_context()
-    def get_all_docs_by_creator_id(cls, creator_id):
-        fields = [
-            cls.model.id, cls.model.kb_id, cls.model.token_num, cls.model.chunk_num, Knowledgebase.tenant_id
-        ]
-        docs = cls.model.select(*fields).join(Knowledgebase, on=(Knowledgebase.id == cls.model.kb_id)).where(
-            cls.model.created_by == creator_id
-        )
-        docs.order_by(cls.model.create_time.asc())
-        # maybe cause slow query by deep paginate, optimize later
-        offset, limit = 0, 100
-        res = []
-        while True:
-            doc_batch = docs.offset(offset).limit(limit)
-            _temp = list(doc_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res
-
    @classmethod
    @DB.connection_context()
    def insert(cls, doc):
@ -385,7 +330,8 @@ class DocumentService(CommonService):
                               process_duration=cls.model.process_duration + duration).where(
            cls.model.id == doc_id).execute()
        if num == 0:
-            logging.warning("Document not found which is supposed to be there")
+            raise LookupError(
+                "Document not found which is supposed to be there")
        num = Knowledgebase.update(
            token_num=Knowledgebase.token_num +
                      token_num,
@ -651,22 +597,6 @@ class DocumentService(CommonService):
    @DB.connection_context()
    def update_progress(cls):
        docs = cls.get_unfinished_docs()
-
-        cls._sync_progress(docs)
-
-
-    @classmethod
-    @DB.connection_context()
-    def update_progress_immediately(cls, docs:list[dict]):
-        if not docs:
-            return
-
-        cls._sync_progress(docs)
-
-
-    @classmethod
-    @DB.connection_context()
-    def _sync_progress(cls, docs:list[dict]):
        for d in docs:
            try:
                tsks = Task.query(doc_id=d["id"], order_by=Task.create_time)
@ -676,6 +606,8 @@ class DocumentService(CommonService):
                prg = 0
                finished = True
                bad = 0
+                has_raptor = False
+                has_graphrag = False
                e, doc = DocumentService.get_by_id(d["id"])
                status = doc.run  # TaskStatus.RUNNING.value
                priority = 0
@ -687,14 +619,24 @@ class DocumentService(CommonService):
                    prg += t.progress if t.progress >= 0 else 0
                    if t.progress_msg.strip():
                        msg.append(t.progress_msg)
+                    if t.task_type == "raptor":
+                        has_raptor = True
+                    elif t.task_type == "graphrag":
+                        has_graphrag = True
                    priority = max(priority, t.priority)
                prg /= len(tsks)
                if finished and bad:
                    prg = -1
                    status = TaskStatus.FAIL.value
                elif finished:
-                    prg = 1
-                    status = TaskStatus.DONE.value
+                    if (d["parser_config"].get("raptor") or {}).get("use_raptor") and not has_raptor:
+                        queue_raptor_o_graphrag_tasks(d, "raptor", priority)
+                        prg = 0.98 * len(tsks) / (len(tsks) + 1)
+                    elif (d["parser_config"].get("graphrag") or {}).get("use_graphrag") and not has_graphrag:
+                        queue_raptor_o_graphrag_tasks(d, "graphrag", priority)
+                        prg = 0.98 * len(tsks) / (len(tsks) + 1)
+                    else:
+                        status = TaskStatus.DONE.value

                msg = "\n".join(sorted(msg))
                info = {
@ -706,7 +648,7 @@ class DocumentService(CommonService):
                    info["progress"] = prg
                if msg:
                    info["progress_msg"] = msg
-                    if msg.endswith("created task graphrag") or msg.endswith("created task raptor") or msg.endswith("created task mindmap"):
+                    if msg.endswith("created task graphrag") or msg.endswith("created task raptor"):
                        info["progress_msg"] += "\n%d tasks are ahead in the queue..."%get_queue_length(priority)
                else:
                    info["progress_msg"] = "%d tasks are ahead in the queue..."%get_queue_length(priority)
@ -787,11 +729,7 @@ class DocumentService(CommonService):
            "cancelled": int(cancelled),
        }

-def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[]):
-    """
-    You can provide a fake_doc_id to bypass the restriction of tasks at the knowledgebase level.
-    Optionally, specify a list of doc_ids to determine which documents participate in the task.
-    """
+def queue_raptor_o_graphrag_tasks(doc, ty, priority):
    chunking_config = DocumentService.get_chunking_config(doc["id"])
    hasher = xxhash.xxh64()
    for field in sorted(chunking_config.keys()):
@ -801,12 +739,11 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[])
        nonlocal doc
        return {
            "id": get_uuid(),
-            "doc_id": fake_doc_id if fake_doc_id else doc["id"],
+            "doc_id": doc["id"],
            "from_page": 100000000,
            "to_page": 100000000,
            "task_type": ty,
-            "progress_msg":  datetime.now().strftime("%H:%M:%S") + " created task " + ty,
-            "begin_at": datetime.now(),
+            "progress_msg":  datetime.now().strftime("%H:%M:%S") + " created task " + ty
        }

    task = new_task()
@ -815,12 +752,7 @@ def queue_raptor_o_graphrag_tasks(doc, ty, priority, fake_doc_id="", doc_ids=[])
    hasher.update(ty.encode("utf-8"))
    task["digest"] = hasher.hexdigest()
    bulk_insert_into_db(Task, [task], True)
-
-    if ty in ["graphrag", "raptor", "mindmap"]:
-        task["doc_ids"] = doc_ids
-        DocumentService.begin2parse(doc["id"])
    assert REDIS_CONN.queue_product(get_svr_queue_name(priority), message=task), "Can't access Redis. Please check the Redis' status."
-    return task["id"]


 def get_queue_length(priority):
--- a/api/db/services/file2document_service.py
+++ b/api/db/services/file2document_service.py
@ -38,12 +38,6 @@ class File2DocumentService(CommonService):
        objs = cls.model.select().where(cls.model.document_id == document_id)
        return objs

-    @classmethod
-    @DB.connection_context()
-    def get_by_document_ids(cls, document_ids):
-        objs = cls.model.select().where(cls.model.document_id.in_(document_ids))
-        return list(objs.dicts())
-
    @classmethod
    @DB.connection_context()
    def insert(cls, obj):
@ -56,15 +50,6 @@ class File2DocumentService(CommonService):
    def delete_by_file_id(cls, file_id):
        return cls.model.delete().where(cls.model.file_id == file_id).execute()

-    @classmethod
-    @DB.connection_context()
-    def delete_by_document_ids_or_file_ids(cls, document_ids, file_ids):
-        if not document_ids:
-            return cls.model.delete().where(cls.model.file_id.in_(file_ids)).execute()
-        elif not file_ids:
-            return cls.model.delete().where(cls.model.document_id.in_(document_ids)).execute()
-        return cls.model.delete().where(cls.model.document_id.in_(document_ids) | cls.model.file_id.in_(file_ids)).execute()
-
    @classmethod
    @DB.connection_context()
    def delete_by_document_id(cls, doc_id):
--- a/api/db/services/file_service.py
+++ b/api/db/services/file_service.py
@ -161,23 +161,6 @@ class FileService(CommonService):
            result_ids.append(folder_id)
        return result_ids

-    @classmethod
-    @DB.connection_context()
-    def get_all_file_ids_by_tenant_id(cls, tenant_id):
-        fields = [cls.model.id]
-        files = cls.model.select(*fields).where(cls.model.tenant_id == tenant_id)
-        files.order_by(cls.model.create_time.asc())
-        offset, limit = 0, 100
-        res = []
-        while True:
-            file_batch = files.offset(offset).limit(limit)
-            _temp = list(file_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res
-
    @classmethod
    @DB.connection_context()
    def create_folder(cls, file, parent_id, name, count):
@ -457,7 +440,6 @@ class FileService(CommonService):
                    "id": doc_id,
                    "kb_id": kb.id,
                    "parser_id": self.get_parser(filetype, filename, kb.parser_id),
-                    "pipeline_id": kb.pipeline_id,
                    "parser_config": kb.parser_config,
                    "created_by": user_id,
                    "type": filetype,
@ -513,7 +495,7 @@ class FileService(CommonService):
            return ParserType.AUDIO.value
        if re.search(r"\.(ppt|pptx|pages)$", filename):
            return ParserType.PRESENTATION.value
-        if re.search(r"\.(msg|eml)$", filename):
+        if re.search(r"\.(eml)$", filename):
            return ParserType.EMAIL.value
        return default

--- a/api/db/services/knowledgebase_service.py
+++ b/api/db/services/knowledgebase_service.py
@ -15,10 +15,10 @@
 #
 from datetime import datetime

-from peewee import fn, JOIN
+from peewee import fn

 from api.db import StatusEnum, TenantPermission
-from api.db.db_models import DB, Document, Knowledgebase, User, UserTenant, UserCanvas
+from api.db.db_models import DB, Document, Knowledgebase, Tenant, User, UserTenant
 from api.db.services.common_service import CommonService
 from api.utils import current_timestamp, datetime_format

@ -260,29 +260,20 @@ class KnowledgebaseService(CommonService):
            cls.model.token_num,
            cls.model.chunk_num,
            cls.model.parser_id,
-            cls.model.pipeline_id,
-            UserCanvas.title.alias("pipeline_name"),
-            UserCanvas.avatar.alias("pipeline_avatar"),
            cls.model.parser_config,
            cls.model.pagerank,
-            cls.model.graphrag_task_id,
-            cls.model.graphrag_task_finish_at,
-            cls.model.raptor_task_id,
-            cls.model.raptor_task_finish_at,
-            cls.model.mindmap_task_id,
-            cls.model.mindmap_task_finish_at,
            cls.model.create_time,
            cls.model.update_time
            ]
-        kbs = cls.model.select(*fields)\
-                .join(UserCanvas, on=(cls.model.pipeline_id == UserCanvas.id), join_type=JOIN.LEFT_OUTER)\
-            .where(
+        kbs = cls.model.select(*fields).join(Tenant, on=(
+            (Tenant.id == cls.model.tenant_id) & (Tenant.status == StatusEnum.VALID.value))).where(
            (cls.model.id == kb_id),
            (cls.model.status == StatusEnum.VALID.value)
-        ).dicts()
+        )
        if not kbs:
            return
-        return kbs[0]
+        d = kbs[0].to_dict()
+        return d

    @classmethod
    @DB.connection_context()
@ -480,17 +471,3 @@ class KnowledgebaseService(CommonService):
            else:
                raise e

-    @classmethod
-    @DB.connection_context()
-    def decrease_document_num_in_delete(cls, kb_id, doc_num_info: dict):
-        kb_row = cls.model.get_by_id(kb_id)
-        if not kb_row:
-            raise RuntimeError(f"kb_id {kb_id} does not exist")
-        update_dict = {
-            'doc_num': kb_row.doc_num - doc_num_info['doc_num'],
-            'chunk_num': kb_row.chunk_num - doc_num_info['chunk_num'],
-            'token_num': kb_row.token_num - doc_num_info['token_num'],
-            'update_time': current_timestamp(),
-            'update_date': datetime_format(datetime.now())
-        }
-        return cls.model.update(update_dict).where(cls.model.id == kb_id).execute()
--- a/api/db/services/langfuse_service.py
+++ b/api/db/services/langfuse_service.py
@ -51,11 +51,6 @@ class TenantLangfuseService(CommonService):
        except peewee.DoesNotExist:
            return None

-    @classmethod
-    @DB.connection_context()
-    def delete_ty_tenant_id(cls, tenant_id):
-        return cls.model.delete().where(cls.model.tenant_id == tenant_id).execute()
-
    @classmethod
    def update_by_tenant(cls, tenant_id, langfuse_keys):
        langfuse_keys["update_time"] = current_timestamp()
--- a/api/db/services/mcp_server_service.py
+++ b/api/db/services/mcp_server_service.py
@ -84,8 +84,3 @@ class MCPServerService(CommonService):
            return bool(mcp_server), mcp_server
        except Exception:
            return False, None
-
-    @classmethod
-    @DB.connection_context()
-    def delete_by_tenant_id(cls, tenant_id: str):
-        return cls.model.delete().where(cls.model.tenant_id == tenant_id).execute()
--- a/api/db/services/pipeline_operation_log_service.py
+++ b/api/db/services/pipeline_operation_log_service.py
@ -1,263 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-#
-import json
-import logging
-import os
-from datetime import datetime, timedelta
-
-from peewee import fn
-
-from api.db import VALID_PIPELINE_TASK_TYPES, PipelineTaskType
-from api.db.db_models import DB, Document, PipelineOperationLog
-from api.db.services.canvas_service import UserCanvasService
-from api.db.services.common_service import CommonService
-from api.db.services.document_service import DocumentService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import GRAPH_RAPTOR_FAKE_DOC_ID
-from api.utils import current_timestamp, datetime_format, get_uuid
-
-
-class PipelineOperationLogService(CommonService):
-    model = PipelineOperationLog
-
-    @classmethod
-    def get_file_logs_fields(cls):
-        return [
-            cls.model.id,
-            cls.model.document_id,
-            cls.model.tenant_id,
-            cls.model.kb_id,
-            cls.model.pipeline_id,
-            cls.model.pipeline_title,
-            cls.model.parser_id,
-            cls.model.document_name,
-            cls.model.document_suffix,
-            cls.model.document_type,
-            cls.model.source_from,
-            cls.model.progress,
-            cls.model.progress_msg,
-            cls.model.process_begin_at,
-            cls.model.process_duration,
-            cls.model.dsl,
-            cls.model.task_type,
-            cls.model.operation_status,
-            cls.model.avatar,
-            cls.model.status,
-            cls.model.create_time,
-            cls.model.create_date,
-            cls.model.update_time,
-            cls.model.update_date,
-        ]
-
-    @classmethod
-    def get_dataset_logs_fields(cls):
-        return [
-            cls.model.id,
-            cls.model.tenant_id,
-            cls.model.kb_id,
-            cls.model.progress,
-            cls.model.progress_msg,
-            cls.model.process_begin_at,
-            cls.model.process_duration,
-            cls.model.task_type,
-            cls.model.operation_status,
-            cls.model.avatar,
-            cls.model.status,
-            cls.model.create_time,
-            cls.model.create_date,
-            cls.model.update_time,
-            cls.model.update_date,
-        ]
-
-    @classmethod
-    def save(cls, **kwargs):
-        """
-        wrap this function in a transaction
-        """
-        sample_obj = cls.model(**kwargs).save(force_insert=True)
-        return sample_obj
-
-    @classmethod
-    @DB.connection_context()
-    def create(cls, document_id, pipeline_id, task_type, fake_document_ids=[], dsl: str = "{}"):
-        referred_document_id = document_id
-
-        if referred_document_id == GRAPH_RAPTOR_FAKE_DOC_ID and fake_document_ids:
-            referred_document_id = fake_document_ids[0]
-        ok, document = DocumentService.get_by_id(referred_document_id)
-        if not ok:
-            logging.warning(f"Document for referred_document_id {referred_document_id} not found")
-            return
-        DocumentService.update_progress_immediately([document.to_dict()])
-        ok, document = DocumentService.get_by_id(referred_document_id)
-        if not ok:
-            logging.warning(f"Document for referred_document_id {referred_document_id} not found")
-            return
-        if document.progress not in [1, -1]:
-            return
-        operation_status = document.run
-
-        if pipeline_id:
-            ok, user_pipeline = UserCanvasService.get_by_id(pipeline_id)
-            if not ok:
-                raise RuntimeError(f"Pipeline {pipeline_id} not found")
-            tenant_id = user_pipeline.user_id
-            title = user_pipeline.title
-            avatar = user_pipeline.avatar
-        else:
-            ok, kb_info = KnowledgebaseService.get_by_id(document.kb_id)
-            if not ok:
-                raise RuntimeError(f"Cannot find knowledge base {document.kb_id} for referred_document {referred_document_id}")
-
-            tenant_id = kb_info.tenant_id
-            title = document.parser_id
-            avatar = document.thumbnail
-
-        if task_type not in VALID_PIPELINE_TASK_TYPES:
-            raise ValueError(f"Invalid task type: {task_type}")
-
-        if task_type in [PipelineTaskType.GRAPH_RAG, PipelineTaskType.RAPTOR, PipelineTaskType.MINDMAP]:
-            finish_at = document.process_begin_at + timedelta(seconds=document.process_duration)
-            if task_type == PipelineTaskType.GRAPH_RAG:
-                KnowledgebaseService.update_by_id(
-                    document.kb_id,
-                    {"graphrag_task_finish_at": finish_at},
-                )
-            elif task_type == PipelineTaskType.RAPTOR:
-                KnowledgebaseService.update_by_id(
-                    document.kb_id,
-                    {"raptor_task_finish_at": finish_at},
-                )
-            elif task_type == PipelineTaskType.MINDMAP:
-                KnowledgebaseService.update_by_id(
-                    document.kb_id,
-                    {"mindmap_task_finish_at": finish_at},
-                )
-
-        log = dict(
-            id=get_uuid(),
-            document_id=document_id,  # GRAPH_RAPTOR_FAKE_DOC_ID or real document_id
-            tenant_id=tenant_id,
-            kb_id=document.kb_id,
-            pipeline_id=pipeline_id,
-            pipeline_title=title,
-            parser_id=document.parser_id,
-            document_name=document.name,
-            document_suffix=document.suffix,
-            document_type=document.type,
-            source_from="",  # TODO: add in the future
-            progress=document.progress,
-            progress_msg=document.progress_msg,
-            process_begin_at=document.process_begin_at,
-            process_duration=document.process_duration,
-            dsl=json.loads(dsl),
-            task_type=task_type,
-            operation_status=operation_status,
-            avatar=avatar,
-        )
-        log["create_time"] = current_timestamp()
-        log["create_date"] = datetime_format(datetime.now())
-        log["update_time"] = current_timestamp()
-        log["update_date"] = datetime_format(datetime.now())
-
-        with DB.atomic():
-            obj = cls.save(**log)
-
-            limit = int(os.getenv("PIPELINE_OPERATION_LOG_LIMIT", 1000))
-            total = cls.model.select().where(cls.model.kb_id == document.kb_id).count()
-
-            if total > limit:
-                keep_ids = [m.id for m in cls.model.select(cls.model.id).where(cls.model.kb_id == document.kb_id).order_by(cls.model.create_time.desc()).limit(limit)]
-
-                deleted = cls.model.delete().where(cls.model.kb_id == document.kb_id, cls.model.id.not_in(keep_ids)).execute()
-                logging.info(f"[PipelineOperationLogService] Cleaned {deleted} old logs, kept latest {limit} for {document.kb_id}")
-
-        return obj
-
-    @classmethod
-    @DB.connection_context()
-    def record_pipeline_operation(cls, document_id, pipeline_id, task_type, fake_document_ids=[]):
-        return cls.create(document_id=document_id, pipeline_id=pipeline_id, task_type=task_type, fake_document_ids=fake_document_ids)
-
-    @classmethod
-    @DB.connection_context()
-    def get_file_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, keywords, operation_status, types, suffix, create_date_from=None, create_date_to=None):
-        fields = cls.get_file_logs_fields()
-        if keywords:
-            logs = cls.model.select(*fields).where((cls.model.kb_id == kb_id), (fn.LOWER(cls.model.document_name).contains(keywords.lower())))
-        else:
-            logs = cls.model.select(*fields).where(cls.model.kb_id == kb_id)
-
-        logs = logs.where(cls.model.document_id != GRAPH_RAPTOR_FAKE_DOC_ID)
-
-        if operation_status:
-            logs = logs.where(cls.model.operation_status.in_(operation_status))
-        if types:
-            logs = logs.where(cls.model.document_type.in_(types))
-        if suffix:
-            logs = logs.where(cls.model.document_suffix.in_(suffix))
-        if create_date_from:
-            logs = logs.where(cls.model.create_date >= create_date_from)
-        if create_date_to:
-            logs = logs.where(cls.model.create_date <= create_date_to)
-
-        count = logs.count()
-        if desc:
-            logs = logs.order_by(cls.model.getter_by(orderby).desc())
-        else:
-            logs = logs.order_by(cls.model.getter_by(orderby).asc())
-
-        if page_number and items_per_page:
-            logs = logs.paginate(page_number, items_per_page)
-
-        return list(logs.dicts()), count
-
-    @classmethod
-    @DB.connection_context()
-    def get_documents_info(cls, id):
-        fields = [Document.id, Document.name, Document.progress, Document.kb_id]
-        return (
-            cls.model.select(*fields)
-            .join(Document, on=(cls.model.document_id == Document.id))
-            .where(
-                cls.model.id == id
-            )
-            .dicts()
-        )
-
-    @classmethod
-    @DB.connection_context()
-    def get_dataset_logs_by_kb_id(cls, kb_id, page_number, items_per_page, orderby, desc, operation_status, create_date_from=None, create_date_to=None):
-        fields = cls.get_dataset_logs_fields()
-        logs = cls.model.select(*fields).where((cls.model.kb_id == kb_id), (cls.model.document_id == GRAPH_RAPTOR_FAKE_DOC_ID))
-
-        if operation_status:
-            logs = logs.where(cls.model.operation_status.in_(operation_status))
-        if create_date_from:
-            logs = logs.where(cls.model.create_date >= create_date_from)
-        if create_date_to:
-            logs = logs.where(cls.model.create_date <= create_date_to)
-
-        count = logs.count()
-        if desc:
-            logs = logs.order_by(cls.model.getter_by(orderby).desc())
-        else:
-            logs = logs.order_by(cls.model.getter_by(orderby).asc())
-
-        if page_number and items_per_page:
-            logs = logs.paginate(page_number, items_per_page)
-
-        return list(logs.dicts()), count
--- a/api/db/services/search_service.py
+++ b/api/db/services/search_service.py
@ -110,8 +110,3 @@ class SearchService(CommonService):
            query = query.paginate(page_number, items_per_page)

        return list(query.dicts()), count
-
-    @classmethod
-    @DB.connection_context()
-    def delete_by_tenant_id(cls, tenant_id):
-        return cls.model.delete().where(cls.model.tenant_id == tenant_id).execute()
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@ -35,8 +35,6 @@ from rag.utils.redis_conn import REDIS_CONN
 from api import settings
 from rag.nlp import search

-CANVAS_DEBUG_DOC_ID = "dataflow_x"
-GRAPH_RAPTOR_FAKE_DOC_ID = "graph_raptor_x"

 def trim_header_by_lines(text: str, max_length) -> str:
    # Trim header text to maximum length while preserving line breaks
@ -72,7 +70,7 @@ class TaskService(CommonService):

    @classmethod
    @DB.connection_context()
-    def get_task(cls, task_id, doc_ids=[]):
+    def get_task(cls, task_id):
        """Retrieve detailed task information by task ID.

        This method fetches comprehensive task details including associated document,
@ -86,10 +84,6 @@ class TaskService(CommonService):
            dict: Task details dictionary containing all task information and related metadata.
                 Returns None if task is not found or has exceeded retry limit.
        """
-        doc_id = cls.model.doc_id
-        if doc_id == CANVAS_DEBUG_DOC_ID and doc_ids:
-            doc_id = doc_ids[0]
-
        fields = [
            cls.model.id,
            cls.model.doc_id,
@ -115,7 +109,7 @@ class TaskService(CommonService):
        ]
        docs = (
            cls.model.select(*fields)
-                .join(Document, on=(doc_id == Document.id))
+                .join(Document, on=(cls.model.doc_id == Document.id))
                .join(Knowledgebase, on=(Document.kb_id == Knowledgebase.id))
                .join(Tenant, on=(Knowledgebase.tenant_id == Tenant.id))
                .where(cls.model.id == task_id)
@ -298,29 +292,21 @@ class TaskService(CommonService):
                        ((prog == -1) | (prog > cls.model.progress))
                    )
                ).execute()
-        else:
-            with DB.lock("update_progress", -1):
-                if info["progress_msg"]:
-                    progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
-                    cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
-                if "progress" in info:
-                    prog = info["progress"]
-                    cls.model.update(progress=prog).where(
-                        (cls.model.id == id) &
-                        (
-                            (cls.model.progress != -1) &
-                            ((prog == -1) | (prog > cls.model.progress))
-                        )
-                    ).execute()
+            return

-        process_duration = (datetime.now() - task.begin_at).total_seconds()
-        cls.model.update(process_duration=process_duration).where(cls.model.id == id).execute()
-
-    @classmethod
-    @DB.connection_context()
-    def delete_by_doc_ids(cls, doc_ids):
-        """Delete task associated with a document."""
-        return cls.model.delete().where(cls.model.doc_id.in_(doc_ids)).execute()
+        with DB.lock("update_progress", -1):
+            if info["progress_msg"]:
+                progress_msg = trim_header_by_lines(task.progress_msg + "\n" + info["progress_msg"], 3000)
+                cls.model.update(progress_msg=progress_msg).where(cls.model.id == id).execute()
+            if "progress" in info:
+                prog = info["progress"]
+                cls.model.update(progress=prog).where(
+                    (cls.model.id == id) &
+                    (
+                        (cls.model.progress != -1) &
+                        ((prog == -1) | (prog > cls.model.progress))
+                    )
+                ).execute()


 def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
@ -344,14 +330,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        - Previous task chunks may be reused if available
    """
    def new_task():
-        return {
-            "id": get_uuid(),
-            "doc_id": doc["id"],
-            "progress": 0.0,
-            "from_page": 0,
-            "to_page": 100000000,
-            "begin_at": datetime.now(),
-        }
+        return {"id": get_uuid(), "doc_id": doc["id"], "progress": 0.0, "from_page": 0, "to_page": 100000000}

    parse_task_array = []

@ -364,7 +343,7 @@ def queue_tasks(doc: dict, bucket: str, name: str, priority: int):
        page_size = doc["parser_config"].get("task_page_size") or 12
        if doc["parser_id"] == "paper":
            page_size = doc["parser_config"].get("task_page_size") or 22
-        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC" or doc["parser_config"].get("toc", True):
+        if doc["parser_id"] in ["one", "knowledge_graph"] or do_layout != "DeepDOC":
            page_size = 10 ** 9
        page_ranges = doc["parser_config"].get("pages") or [(1, 10 ** 5)]
        for s, e in page_ranges:
@ -493,26 +472,33 @@ def has_canceled(task_id):
    return False


-def queue_dataflow(tenant_id:str, flow_id:str, task_id:str, doc_id:str=CANVAS_DEBUG_DOC_ID, file:dict=None, priority: int=0, rerun:bool=False) -> tuple[bool, str]:
+def queue_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, priority: int, callback=None) -> tuple[bool, str]:
+    """
+    Returns a tuple (success: bool, error_message: str).
+    """
+    _ = callback

    task = dict(
-        id=task_id,
-        doc_id=doc_id,
-        from_page=0,
-        to_page=100000000,
-        task_type="dataflow" if not rerun else "dataflow_rerun",
-        priority=priority,
-        begin_at=datetime.now(),
+    id=get_uuid() if not task_id else task_id,
+    doc_id=doc_id,
+    from_page=0,
+    to_page=100000000,
+    task_type="dataflow",
+    priority=priority,
    )
-    if doc_id not in [CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID]:
-        TaskService.model.delete().where(TaskService.model.doc_id == doc_id).execute()
-        DocumentService.begin2parse(doc_id)
+
+    TaskService.model.delete().where(TaskService.model.id == task["id"]).execute()
    bulk_insert_into_db(model=Task, data_source=[task], replace_on_conflict=True)

-    task["kb_id"] = DocumentService.get_knowledgebase_id(doc_id)
+    kb_id = DocumentService.get_knowledgebase_id(doc_id)
+    if not kb_id:
+        return False, f"Can't find KB of this document: {doc_id}"
+
+    task["kb_id"] = kb_id
    task["tenant_id"] = tenant_id
-    task["dataflow_id"] = flow_id
-    task["file"] = file
+    task["task_type"] = "dataflow"
+    task["dsl"] = dsl
+    task["dataflow_id"] = get_uuid() if not flow_id else flow_id

    if not REDIS_CONN.queue_product(
        get_svr_queue_name(priority), message=task
--- a/api/db/services/tenant_llm_service.py
+++ b/api/db/services/tenant_llm_service.py
@ -209,11 +209,6 @@ class TenantLLMService(CommonService):
        objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), ~(cls.model.llm_name == "text-embedding-3-small"), ~(cls.model.llm_name == "text-embedding-3-large")).dicts()
        return list(objs)

-    @classmethod
-    @DB.connection_context()
-    def delete_by_tenant_id(cls, tenant_id):
-        return cls.model.delete().where(cls.model.tenant_id == tenant_id).execute()
-
    @staticmethod
    def llm_id2llm_type(llm_id: str) -> str | None:
        from api.db.services.llm_service import LLMService
--- a/api/db/services/user_canvas_version.py
+++ b/api/db/services/user_canvas_version.py
@ -24,24 +24,7 @@ class UserCanvasVersionService(CommonService):
            return None
        except Exception:
            return None
-
-    @classmethod
-    @DB.connection_context()
-    def get_all_canvas_version_by_canvas_ids(cls, canvas_ids):
-        fields = [cls.model.id]
-        versions = cls.model.select(*fields).where(cls.model.user_canvas_id.in_(canvas_ids))
-        versions.order_by(cls.model.create_time.asc())
-        offset, limit = 0, 100
-        res = []
-        while True:
-            version_batch = versions.offset(offset).limit(limit)
-            _temp = list(version_batch.dicts())
-            if not _temp:
-                break
-            res.extend(_temp)
-            offset += limit
-        return res
-
+    
    @classmethod
    @DB.connection_context()
    def delete_all_versions(cls, user_canvas_id):
--- a/api/db/services/user_service.py
+++ b/api/db/services/user_service.py
@ -288,17 +288,6 @@ class UserTenantService(CommonService):
                    .join(User, on=((cls.model.tenant_id == User.id) & (UserTenant.user_id == user_id) & (UserTenant.status == StatusEnum.VALID.value)))
                    .where(cls.model.status == StatusEnum.VALID.value).dicts())

-    @classmethod
-    @DB.connection_context()
-    def get_user_tenant_relation_by_user_id(cls, user_id):
-        fields = [
-            cls.model.id,
-            cls.model.user_id,
-            cls.model.tenant_id,
-            cls.model.role
-        ]
-        return list(cls.model.select(*fields).where(cls.model.user_id == user_id).dicts().dicts())
-
    @classmethod
    @DB.connection_context()
    def get_num_members(cls, user_id: str):
--- a/api/utils/api_utils.py
+++ b/api/utils/api_utils.py
@ -659,16 +659,6 @@ def remap_dictionary_keys(source_data: dict, key_aliases: dict = None) -> dict:
    return transformed_data


-def group_by(list_of_dict, key):
-    res = {}
-    for item in list_of_dict:
-        if item[key] in res.keys():
-            res[item[key]].append(item)
-        else:
-            res[item[key]] = [item]
-    return res
-
-
 def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, str]:
    results = {}
    tool_call_sessions = []
@ -705,9 +695,7 @@ TimeoutException = Union[Type[BaseException], BaseException]
 OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]


-def timeout(seconds: float | int | str = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
-    if isinstance(seconds, str):
-        seconds = float(seconds)
+def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
--- a/api/utils/base64_image.py
+++ b/api/utils/base64_image.py
@ -1,56 +1,3 @@
 import base64
-import logging
-from functools import partial
-from io import BytesIO
-
-from PIL import Image
-
 test_image_base64 = "iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAIAAAD/gAIDAAAA6ElEQVR4nO3QwQ3AIBDAsIP9d25XIC+EZE8QZc18w5l9O+AlZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBWYFZgVmBT+IYAHHLHkdEgAAAABJRU5ErkJggg=="
-test_image = base64.b64decode(test_image_base64)
-
-
-async def image2id(d: dict, storage_put_func: partial, objname:str, bucket:str="imagetemps"):
-    import logging
-    from io import BytesIO
-    import trio
-    from rag.svr.task_executor import minio_limiter
-    if not d.get("image"):
-        return
-
-    with BytesIO() as output_buffer:
-        if isinstance(d["image"], bytes):
-            output_buffer.write(d["image"])
-            output_buffer.seek(0)
-        else:
-            # If the image is in RGBA mode, convert it to RGB mode before saving it in JPEG format.
-            if d["image"].mode in ("RGBA", "P"):
-                converted_image = d["image"].convert("RGB")
-                d["image"] = converted_image
-            try:
-                d["image"].save(output_buffer, format='JPEG')
-            except OSError as e:
-                logging.warning(
-                    "Saving image exception, ignore: {}".format(str(e)))
-
-        async with minio_limiter:
-            await trio.to_thread.run_sync(lambda: storage_put_func(bucket=bucket, fnm=objname, binary=output_buffer.getvalue()))
-        d["img_id"] = f"{bucket}-{objname}"
-        if not isinstance(d["image"], bytes):
-            d["image"].close()
-        del d["image"]  # Remove image reference
-
-
-def id2image(image_id:str|None, storage_get_func: partial):
-    if not image_id:
-        return
-    arr = image_id.split("-")
-    if len(arr) != 2:
-        return
-    bkt, nm = image_id.split("-")
-    try:
-        blob = storage_get_func(bucket=bkt, filename=nm)
-        if not blob:
-            return
-        return Image.open(BytesIO(blob))
-    except Exception as e:
-        logging.exception(e)
+test_image = base64.b64decode(test_image_base64)
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@ -155,7 +155,7 @@ def filename_type(filename):
    if re.match(r".*\.pdf$", filename):
        return FileType.PDF.value

-    if re.match(r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
+    if re.match(r".*\.(eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename):
        return FileType.DOC.value

    if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
--- a/api/utils/health.py
+++ b/api/utils/health.py
@ -1,104 +0,0 @@
-from timeit import default_timer as timer
-
-from api import settings
-from api.db.db_models import DB
-from rag.utils.redis_conn import REDIS_CONN
-from rag.utils.storage_factory import STORAGE_IMPL
-
-
-def _ok_nok(ok: bool) -> str:
-    return "ok" if ok else "nok"
-
-
-def check_db() -> tuple[bool, dict]:
-    st = timer()
-    try:
-        # lightweight probe; works for MySQL/Postgres
-        DB.execute_sql("SELECT 1")
-        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
-    except Exception as e:
-        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
-
-
-def check_redis() -> tuple[bool, dict]:
-    st = timer()
-    try:
-        ok = bool(REDIS_CONN.health())
-        return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
-    except Exception as e:
-        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
-
-
-def check_doc_engine() -> tuple[bool, dict]:
-    st = timer()
-    try:
-        meta = settings.docStoreConn.health()
-        # treat any successful call as ok
-        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", **(meta or {})}
-    except Exception as e:
-        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
-
-
-def check_storage() -> tuple[bool, dict]:
-    st = timer()
-    try:
-        STORAGE_IMPL.health()
-        return True, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
-    except Exception as e:
-        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
-
-
-def check_chat() -> tuple[bool, dict]:
-    st = timer()
-    try:
-        cfg = getattr(settings, "CHAT_CFG", None)
-        ok = bool(cfg and cfg.get("factory"))
-        return ok, {"elapsed": f"{(timer() - st) * 1000.0:.1f}"}
-    except Exception as e:
-        return False, {"elapsed": f"{(timer() - st) * 1000.0:.1f}", "error": str(e)}
-
-
-def run_health_checks() -> tuple[dict, bool]:
-    result: dict[str, str | dict] = {}
-
-    db_ok, db_meta = check_db()
-    chat_ok, chat_meta = check_chat()
-
-    result["db"] = _ok_nok(db_ok)
-    if not db_ok:
-        result.setdefault("_meta", {})["db"] = db_meta
-
-    result["chat"] = _ok_nok(chat_ok)
-    if not chat_ok:
-        result.setdefault("_meta", {})["chat"] = chat_meta
-
-    # Optional probes (do not change minimal contract but exposed for observability)
-    try:
-        redis_ok, redis_meta = check_redis()
-        result["redis"] = _ok_nok(redis_ok)
-        if not redis_ok:
-            result.setdefault("_meta", {})["redis"] = redis_meta
-    except Exception:
-        result["redis"] = "nok"
-
-    try:
-        doc_ok, doc_meta = check_doc_engine()
-        result["doc_engine"] = _ok_nok(doc_ok)
-        if not doc_ok:
-            result.setdefault("_meta", {})["doc_engine"] = doc_meta
-    except Exception:
-        result["doc_engine"] = "nok"
-
-    try:
-        sto_ok, sto_meta = check_storage()
-        result["storage"] = _ok_nok(sto_ok)
-        if not sto_ok:
-            result.setdefault("_meta", {})["storage"] = sto_meta
-    except Exception:
-        result["storage"] = "nok"
-
-    all_ok = (result.get("db") == "ok") and (result.get("chat") == "ok")
-    result["status"] = "ok" if all_ok else "nok"
-    return result, all_ok
-
-
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -1075,10 +1075,11 @@ class RAGFlowPdfParser:
        def insert_table_figures(tbls_or_figs, layout_type):
            def min_rectangle_distance(rect1, rect2):
                import math
+
                pn1, left1, right1, top1, bottom1 = rect1
                pn2, left2, right2, top2, bottom2 = rect2
                if right1 >= left2 and right2 >= left1 and bottom1 >= top2 and bottom2 >= top1:
-                    return 0
+                    return 0 + (pn1 - pn2) * 10000
                if right1 < left2:
                    dx = left2 - right1
                elif right2 < left1:
@ -1091,27 +1092,20 @@ class RAGFlowPdfParser:
                    dy = top1 - bottom2
                else:
                    dy = 0
-                return math.sqrt(dx*dx + dy*dy)# + (pn2-pn1)*10000
+                return math.sqrt(dx * dx + dy * dy) + (pn1 - pn2) * 10000

            for (img, txt), poss in tbls_or_figs:
                bboxes = [(i, (b["page_number"], b["x0"], b["x1"], b["top"], b["bottom"])) for i, b in enumerate(self.boxes)]
-                dists = [(min_rectangle_distance((pn, left, right, top+self.page_cum_height[pn], bott+self.page_cum_height[pn]), rect),i) for i, rect in bboxes for pn, left, right, top, bott in poss]
+                dists = [(min_rectangle_distance((pn, left, right, top, bott), rect), i) for i, rect in bboxes for pn, left, right, top, bott in poss]
                min_i = np.argmin(dists, axis=0)[0]
                min_i, rect = bboxes[dists[min_i][-1]]
                if isinstance(txt, list):
                    txt = "\n".join(txt)
-                pn, left, right, top, bott = poss[0]
-                if self.boxes[min_i]["bottom"] < top+self.page_cum_height[pn]:
-                    min_i += 1
-                self.boxes.insert(min_i, {
-                    "page_number": pn+1, "x0": left, "x1": right, "top": top+self.page_cum_height[pn], "bottom": bott+self.page_cum_height[pn], "layout_type": layout_type, "text": txt, "image": img,
-                    "positions": [[pn+1, int(left), int(right), int(top), int(bott)]]
-                })
+                self.boxes.insert(min_i, {"page_number": rect[0], "x0": rect[1], "x1": rect[2], "top": rect[3], "bottom": rect[4], "layout_type": layout_type, "text": txt, "image": img})

        for b in self.boxes:
            b["position_tag"] = self._line_tag(b, zoomin)
            b["image"] = self.crop(b["position_tag"], zoomin)
-            b["positions"] = [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(b["position_tag"])]

        insert_table_figures(tbls, "table")
        insert_table_figures(figs, "figure")
@ -1129,7 +1123,7 @@ class RAGFlowPdfParser:
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(right), float(top), float(bottom)
-            poss.append(([int(p) - 1 for p in pn.split("-")], int(left), int(right), int(top), int(bottom)))
+            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
        return poss

    def crop(self, text, ZM=3, need_position=False):
--- a/docs/references/supported_models.mdx
+++ b/docs/references/supported_models.mdx
@ -65,7 +65,7 @@ A complete list of models supported by RAGFlow, which will continue to expand.
 | 01.AI                 | :heavy_check_mark: |                    |                    |                    |                    |                    |
 | DeepInfra             | :heavy_check_mark: | :heavy_check_mark: |                    |                    | :heavy_check_mark: | :heavy_check_mark: |
 | 302.AI                | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |                    |                    |
-| CometAPI              | :heavy_check_mark: | :heavy_check_mark: |                    |                    |                    |                    |
+| CometAPI              | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |                    |                    |

 ```mdx-code-block
 </APITable>
--- a/graphrag/general/index.py
+++ b/graphrag/general/index.py
@ -21,7 +21,6 @@ import networkx as nx
 import trio

 from api import settings
-from api.db.services.document_service import DocumentService
 from api.utils import get_uuid
 from api.utils.api_utils import timeout
 from graphrag.entity_resolution import EntityResolution
@ -55,7 +54,7 @@ async def run_graphrag(
    start = trio.current_time()
    tenant_id, kb_id, doc_id = row["tenant_id"], str(row["kb_id"]), row["doc_id"]
    chunks = []
-    for d in settings.retrievaler.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"], sort_by_position=True):
+    for d in settings.retrievaler.chunk_list(doc_id, tenant_id, [kb_id], fields=["content_with_weight", "doc_id"]):
        chunks.append(d["content_with_weight"])

    with trio.fail_after(max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000):
@ -126,212 +125,6 @@ async def run_graphrag(
    return


-async def run_graphrag_for_kb(
-    row: dict,
-    doc_ids: list[str],
-    language: str,
-    kb_parser_config: dict,
-    chat_model,
-    embedding_model,
-    callback,
-    *,
-    with_resolution: bool = True,
-    with_community: bool = True,
-    max_parallel_docs: int = 4,
-) -> dict:
-    tenant_id, kb_id = row["tenant_id"], row["kb_id"]
-    enable_timeout_assertion = os.environ.get("ENABLE_TIMEOUT_ASSERTION")
-    start = trio.current_time()
-    fields_for_chunks = ["content_with_weight", "doc_id"]
-
-    if not doc_ids:
-        logging.info(f"Fetching all docs for {kb_id}")
-        docs, _ = DocumentService.get_by_kb_id(
-            kb_id=kb_id,
-            page_number=0,
-            items_per_page=0,
-            orderby="create_time",
-            desc=False,
-            keywords="",
-            run_status=[],
-            types=[],
-            suffix=[],
-        )
-        doc_ids = [doc["id"] for doc in docs]
-
-    doc_ids = list(dict.fromkeys(doc_ids))
-    if not doc_ids:
-        callback(msg=f"[GraphRAG] kb:{kb_id} has no processable doc_id.")
-        return {"ok_docs": [], "failed_docs": [], "total_docs": 0, "total_chunks": 0, "seconds": 0.0}
-
-    def load_doc_chunks(doc_id: str) -> list[str]:
-        from rag.utils import num_tokens_from_string
-
-        chunks = []
-        current_chunk = ""
-
-        for d in settings.retrievaler.chunk_list(
-            doc_id,
-            tenant_id,
-            [kb_id],
-            fields=fields_for_chunks,
-            sort_by_position=True,
-        ):
-            content = d["content_with_weight"]
-            if num_tokens_from_string(current_chunk + content) < 1024:
-                current_chunk += content
-            else:
-                if current_chunk:
-                    chunks.append(current_chunk)
-                current_chunk = content
-
-        if current_chunk:
-            chunks.append(current_chunk)
-
-        return chunks
-
-    all_doc_chunks: dict[str, list[str]] = {}
-    total_chunks = 0
-    for doc_id in doc_ids:
-        chunks = load_doc_chunks(doc_id)
-        all_doc_chunks[doc_id] = chunks
-        total_chunks += len(chunks)
-
-    if total_chunks == 0:
-        callback(msg=f"[GraphRAG] kb:{kb_id} has no available chunks in all documents, skip.")
-        return {"ok_docs": [], "failed_docs": doc_ids, "total_docs": len(doc_ids), "total_chunks": 0, "seconds": 0.0}
-
-    semaphore = trio.Semaphore(max_parallel_docs)
-
-    subgraphs: dict[str, object] = {}
-    failed_docs: list[tuple[str, str]] = []  # (doc_id, error)
-
-    async def build_one(doc_id: str):
-        chunks = all_doc_chunks.get(doc_id, [])
-        if not chunks:
-            callback(msg=f"[GraphRAG] doc:{doc_id} has no available chunks, skip generation.")
-            return
-
-        kg_extractor = LightKGExt if ("method" not in kb_parser_config.get("graphrag", {}) or kb_parser_config["graphrag"]["method"] != "general") else GeneralKGExt
-
-        deadline = max(120, len(chunks) * 60 * 10) if enable_timeout_assertion else 10000000000
-
-        async with semaphore:
-            try:
-                msg = f"[GraphRAG] build_subgraph doc:{doc_id}"
-                callback(msg=f"{msg} start (chunks={len(chunks)}, timeout={deadline}s)")
-                with trio.fail_after(deadline):
-                    sg = await generate_subgraph(
-                        kg_extractor,
-                        tenant_id,
-                        kb_id,
-                        doc_id,
-                        chunks,
-                        language,
-                        kb_parser_config.get("graphrag", {}).get("entity_types", []),
-                        chat_model,
-                        embedding_model,
-                        callback,
-                    )
-                if sg:
-                    subgraphs[doc_id] = sg
-                    callback(msg=f"{msg} done")
-                else:
-                    failed_docs.append((doc_id, "subgraph is empty"))
-                    callback(msg=f"{msg} empty")
-            except Exception as e:
-                failed_docs.append((doc_id, repr(e)))
-                callback(msg=f"[GraphRAG] build_subgraph doc:{doc_id} FAILED: {e!r}")
-
-    async with trio.open_nursery() as nursery:
-        for doc_id in doc_ids:
-            nursery.start_soon(build_one, doc_id)
-
-    ok_docs = [d for d in doc_ids if d in subgraphs]
-    if not ok_docs:
-        callback(msg=f"[GraphRAG] kb:{kb_id} no subgraphs generated successfully, end.")
-        now = trio.current_time()
-        return {"ok_docs": [], "failed_docs": failed_docs, "total_docs": len(doc_ids), "total_chunks": total_chunks, "seconds": now - start}
-
-    kb_lock = RedisDistributedLock(f"graphrag_task_{kb_id}", lock_value="batch_merge", timeout=1200)
-    await kb_lock.spin_acquire()
-    callback(msg=f"[GraphRAG] kb:{kb_id} merge lock acquired")
-
-    try:
-        union_nodes: set = set()
-        final_graph = None
-
-        for doc_id in ok_docs:
-            sg = subgraphs[doc_id]
-            union_nodes.update(set(sg.nodes()))
-
-            new_graph = await merge_subgraph(
-                tenant_id,
-                kb_id,
-                doc_id,
-                sg,
-                embedding_model,
-                callback,
-            )
-            if new_graph is not None:
-                final_graph = new_graph
-
-        if final_graph is None:
-            callback(msg=f"[GraphRAG] kb:{kb_id} merge finished (no in-memory graph returned).")
-        else:
-            callback(msg=f"[GraphRAG] kb:{kb_id} merge finished, graph ready.")
-    finally:
-        kb_lock.release()
-
-    if not with_resolution and not with_community:
-        now = trio.current_time()
-        callback(msg=f"[GraphRAG] KB merge done in {now - start:.2f}s. ok={len(ok_docs)} / total={len(doc_ids)}")
-        return {"ok_docs": ok_docs, "failed_docs": failed_docs, "total_docs": len(doc_ids), "total_chunks": total_chunks, "seconds": now - start}
-
-    await kb_lock.spin_acquire()
-    callback(msg=f"[GraphRAG] kb:{kb_id} post-merge lock acquired for resolution/community")
-
-    try:
-        subgraph_nodes = set()
-        for sg in subgraphs.values():
-            subgraph_nodes.update(set(sg.nodes()))
-
-        if with_resolution:
-            await resolve_entities(
-                final_graph,
-                subgraph_nodes,
-                tenant_id,
-                kb_id,
-                None,
-                chat_model,
-                embedding_model,
-                callback,
-            )
-
-        if with_community:
-            await extract_community(
-                final_graph,
-                tenant_id,
-                kb_id,
-                None,
-                chat_model,
-                embedding_model,
-                callback,
-            )
-    finally:
-        kb_lock.release()
-
-    now = trio.current_time()
-    callback(msg=f"[GraphRAG] GraphRAG for KB {kb_id} done in {now - start:.2f} seconds. ok={len(ok_docs)} failed={len(failed_docs)} total_docs={len(doc_ids)} total_chunks={total_chunks}")
-    return {
-        "ok_docs": ok_docs,
-        "failed_docs": failed_docs,  # [(doc_id, error), ...]
-        "total_docs": len(doc_ids),
-        "total_chunks": total_chunks,
-        "seconds": now - start,
-    }
-
-
 async def generate_subgraph(
    extractor: Extractor,
    tenant_id: str,
--- a/pyproject.toml
+++ b/pyproject.toml
@ -34,7 +34,6 @@ dependencies = [
    "elastic-transport==8.12.0",
    "elasticsearch==8.12.1",
    "elasticsearch-dsl==8.12.0",
-    "extract-msg>=0.39.0",
    "filelock==3.15.4",
    "flask==3.0.3",
    "flask-cors==5.0.0",
@ -158,9 +157,6 @@ test = [
    "requests-toolbelt>=1.0.0",
 ]

-[[tool.uv.index]]
-url = "https://mirrors.aliyun.com/pypi/simple"
-
 [tool.setuptools]
 packages = [
    'agent',
@ -174,6 +170,9 @@ packages = [
    'sdk.python.ragflow_sdk',
 ]

+[[tool.uv.index]]
+url = "https://mirrors.aliyun.com/pypi/simple"
+
 [tool.ruff]
 line-length = 200
 exclude = [".venv", "rag/svr/discord_svr.py"]
--- a/rag/app/email.py
+++ b/rag/app/email.py
@ -78,7 +78,7 @@ def chunk(
    _add_content(msg, msg.get_content_type())

    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
-        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=parser_config["chunk_token_num"]) if line
+        (line, "") for line in HtmlParser.parser_txt("\n".join(html_txt)) if line
    ]

    st = timer()
--- a/rag/flow/base.py
+++ b/rag/flow/base.py
@ -18,7 +18,9 @@ import os
 import time
 from functools import partial
 from typing import Any
+
 import trio
+
 from agent.component.base import ComponentBase, ComponentParamBase
 from api.utils.api_utils import timeout

@ -34,9 +36,9 @@ class ProcessBase(ComponentBase):
    def __init__(self, pipeline, id, param: ProcessParamBase):
        super().__init__(pipeline, id, param)
        if hasattr(self._canvas, "callback"):
-            self.callback = partial(self._canvas.callback, id)
+            self.callback = partial(self._canvas.callback, self.component_name)
        else:
-            self.callback = partial(lambda *args, **kwargs: None, id)
+            self.callback = partial(lambda *args, **kwargs: None, self.component_name)

    async def invoke(self, **kwargs) -> dict[str, Any]:
        self.set_output("_created_time", time.perf_counter())
@ -56,6 +58,6 @@ class ProcessBase(ComponentBase):
        self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
        return self.output()

-    @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
+    @timeout(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60))
    async def _invoke(self, **kwargs):
        raise NotImplementedError()
--- a/rag/flow/chunker/chunker.py
+++ b/rag/flow/chunker/chunker.py
@ -12,19 +12,18 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
-import json
 import random
+
 import trio
+
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from graphrag.utils import chat_limiter, get_llm_cache, set_llm_cache
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.chunker.schema import ChunkerFromUpstream
-from rag.nlp import naive_merge, naive_merge_with_images, concat_img
-from rag.prompts.prompts import keyword_extraction, question_proposal, detect_table_of_contents, \
-    table_of_contents_index, toc_transformer
-from rag.utils import num_tokens_from_string
+from rag.nlp import naive_merge, naive_merge_with_images
+from rag.prompts.generator import keyword_extraction, question_proposal


 class ChunkerParam(ProcessParamBase):
@ -44,7 +43,6 @@ class ChunkerParam(ProcessParamBase):
            "paper",
            "laws",
            "presentation",
-            "toc" # table of contents
            # Other
            # "Tag" # TODO: Other method
        ]
@ -56,7 +54,7 @@ class ChunkerParam(ProcessParamBase):
        self.auto_keywords = 0
        self.auto_questions = 0
        self.tag_sets = []
-        self.llm_setting = {"llm_id": "", "lang": "Chinese"}
+        self.llm_setting = {"llm_name": "", "lang": "Chinese"}

    def check(self):
        self.check_valid_value(self.method.lower(), "Chunk method abnormal.", self.method_options)
@ -144,91 +142,6 @@ class Chunker(ProcessBase):
    def _one(self, from_upstream: ChunkerFromUpstream):
        pass

-    def _toc(self, from_upstream: ChunkerFromUpstream):
-        self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `ToC`.")
-        if from_upstream.output_format in ["markdown", "text", "html"]:
-            return
-
-        # json
-        sections, section_images, page_1024, tc_arr = [], [], [""], [0]
-        for o in from_upstream.json_result or []:
-            txt = o.get("text", "")
-            tc = num_tokens_from_string(txt)
-            page_1024[-1] += "\n" + txt
-            tc_arr[-1] += tc
-            if tc_arr[-1] > 1024:
-                page_1024.append("")
-                tc_arr.append(0)
-            sections.append((o.get("text", ""), o.get("position_tag", "")))
-            section_images.append(o.get("image"))
-            print(len(sections), o)
-
-        llm_setting = self._param.llm_setting
-        chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
-        self.callback(random.randint(5, 15) / 100.0, "Start to detect table of contents...")
-        toc_secs = detect_table_of_contents(page_1024, chat_mdl)
-        if toc_secs:
-            self.callback(random.randint(25, 35) / 100.0, "Start to extract table of contents...")
-            toc_arr = toc_transformer(toc_secs, chat_mdl)
-            toc_arr = [it for it in toc_arr if it.get("structure")]
-            print(json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True)
-            self.callback(random.randint(35, 75) / 100.0, "Start to link table of contents...")
-            toc_arr = table_of_contents_index(toc_arr, [t for t,_ in sections], chat_mdl)
-            for i in range(len(toc_arr)-1):
-                if not toc_arr[i].get("indices"):
-                    continue
-
-                for j in range(i+1, len(toc_arr)):
-                    if toc_arr[j].get("indices"):
-                        if toc_arr[j]["indices"][0] - toc_arr[i]["indices"][-1] > 1:
-                            toc_arr[i]["indices"].extend([x for x in range(toc_arr[i]["indices"][-1]+1, toc_arr[j]["indices"][0])])
-                        break
-            # put all sections ahead of toc_arr[0] into it
-            # for i in range(len(toc_arr)):
-            #     if toc_arr[i].get("indices") and toc_arr[i]["indices"][0]:
-            #         toc_arr[i]["indices"] = [x for x in range(toc_arr[i]["indices"][-1]+1)]
-            #         break
-            # put all sections after toc_arr[-1] into it
-            for i in range(len(toc_arr)-1, -1, -1):
-                if toc_arr[i].get("indices") and toc_arr[i]["indices"][-1]:
-                    toc_arr[i]["indices"] = [x for x in range(toc_arr[i]["indices"][0], len(sections))]
-                    break
-            print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n", json.dumps(toc_arr, ensure_ascii=False, indent=2), flush=True)
-
-            chunks, images = [], []
-            for it in toc_arr:
-                if not it.get("indices"):
-                    continue
-                txt = ""
-                img = None
-                for i in it["indices"]:
-                    idx = i
-                    txt += "\n" + sections[idx][0] + "\t" + sections[idx][1]
-                    if img and section_images[idx]:
-                        img = concat_img(img, section_images[idx])
-                    elif section_images[idx]:
-                        img = section_images[idx]
-
-                it["indices"] = []
-                if not txt:
-                    continue
-                it["indices"] = [len(chunks)]
-                print(it, "KKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKKK\n", txt)
-                chunks.append(txt)
-                images.append(img)
-            self.callback(1, "Done")
-            return [
-                {
-                    "text": RAGFlowPdfParser.remove_tag(c),
-                    "image": img,
-                    "positions": RAGFlowPdfParser.extract_positions(c),
-                }
-                for c, img in zip(chunks, images)
-            ]
-
-        self.callback(message="No table of contents detected.")
-
-
    async def _invoke(self, **kwargs):
        function_map = {
            "general": self._general,
@ -254,7 +167,7 @@ class Chunker(ProcessBase):

        async def auto_keywords():
            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
+            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])

            async def doc_keyword_extraction(chat_mdl, ck, topn):
                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "keywords", {"topn": topn})
@ -271,7 +184,7 @@ class Chunker(ProcessBase):

        async def auto_questions():
            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
+            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])

            async def doc_question_proposal(chat_mdl, d, topn):
                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "question", {"topn": topn})
--- a/rag/flow/chunker/schema.py
+++ b/rag/flow/chunker/schema.py
@ -22,7 +22,7 @@ class ChunkerFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    file: dict | None = Field(default=None)
+    blob: bytes

    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)

--- a/rag/flow/extractor/init.py
+++ b/rag/flow/extractor/init.py
@ -1,15 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
--- a/rag/flow/extractor/extractor.py
+++ b/rag/flow/extractor/extractor.py
@ -1,63 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import random
-from copy import deepcopy
-from agent.component.llm import LLMParam, LLM
-from rag.flow.base import ProcessBase, ProcessParamBase
-
-
-class ExtractorParam(ProcessParamBase, LLMParam):
-    def __init__(self):
-        super().__init__()
-        self.field_name = ""
-
-    def check(self):
-        super().check()
-        self.check_empty(self.field_name, "Result Destination")
-
-
-class Extractor(ProcessBase, LLM):
-    component_name = "Extractor"
-
-    async def _invoke(self, **kwargs):
-        self.set_output("output_format", "chunks")
-        self.callback(random.randint(1, 5) / 100.0, "Start to generate.")
-        inputs = self.get_input_elements()
-        chunks = []
-        chunks_key = ""
-        args = {}
-        for k, v in inputs.items():
-            args[k] = v["value"]
-            if isinstance(args[k], list):
-                chunks = deepcopy(args[k])
-                chunks_key = k
-
-        if chunks:
-            prog = 0
-            for i, ck in enumerate(chunks):
-                args[chunks_key] = ck["text"]
-                msg, sys_prompt = self._sys_prompt_and_msg([], args)
-                msg.insert(0, {"role": "system", "content": sys_prompt})
-                ck[self._param.field_name] = self._generate(msg)
-                prog += 1./len(chunks)
-                if i % (len(chunks)//100+1) == 1:
-                    self.callback(prog, f"{i+1} / {len(chunks)}")
-            self.set_output("chunks", chunks)
-        else:
-            msg, sys_prompt = self._sys_prompt_and_msg([], args)
-            msg.insert(0, {"role": "system", "content": sys_prompt})
-            self.set_output("chunks", [{self._param.field_name: self._generate(msg)}])
-
-
--- a/rag/flow/extractor/schema.py
+++ b/rag/flow/extractor/schema.py
@ -1,38 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-from typing import Any, Literal
-
-from pydantic import BaseModel, ConfigDict, Field
-
-
-class ExtractorFromUpstream(BaseModel):
-    created_time: float | None = Field(default=None, alias="_created_time")
-    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
-
-    name: str
-    file: dict | None = Field(default=None)
-    chunks: list[dict[str, Any]] | None = Field(default=None)
-
-    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
-
-    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
-    markdown_result: str | None = Field(default=None, alias="markdown")
-    text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
-
-    model_config = ConfigDict(populate_by_name=True, extra="forbid")
-
-    # def to_dict(self, *, exclude_none: bool = True) -> dict:
-    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/file.py
+++ b/rag/flow/file.py
@ -14,7 +14,10 @@
 #  limitations under the License.
 #
 from api.db.services.document_service import DocumentService
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
 from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.utils.storage_factory import STORAGE_IMPL


 class FileParam(ProcessParamBase):
@ -38,13 +41,10 @@ class File(ProcessBase):
                self.set_output("_ERROR", f"Document({self._canvas._doc_id}) not found!")
                return

-            #b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
-            #self.set_output("blob", STORAGE_IMPL.get(b, n))
+            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
+            self.set_output("blob", STORAGE_IMPL.get(b, n))
            self.set_output("name", doc.name)
        else:
            file = kwargs.get("file")
            self.set_output("name", file["name"])
-            self.set_output("file", file)
-            #self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
-
-        self.callback(1, "File fetched.")
+            self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
--- a/rag/flow/hierarchical_merger/init.py
+++ b/rag/flow/hierarchical_merger/init.py
@ -1,15 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
--- a/rag/flow/hierarchical_merger/hierarchical_merger.py
+++ b/rag/flow/hierarchical_merger/hierarchical_merger.py
@ -1,186 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
-import random
-import re
-from copy import deepcopy
-from functools import partial
-
-import trio
-
-from api.utils import get_uuid
-from api.utils.base64_image import id2image, image2id
-from deepdoc.parser.pdf_parser import RAGFlowPdfParser
-from rag.flow.base import ProcessBase, ProcessParamBase
-from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
-from rag.nlp import concat_img
-from rag.utils.storage_factory import STORAGE_IMPL
-
-
-class HierarchicalMergerParam(ProcessParamBase):
-    def __init__(self):
-        super().__init__()
-        self.levels = []
-        self.hierarchy = None
-
-    def check(self):
-        self.check_empty(self.levels, "Hierarchical setups.")
-        self.check_empty(self.hierarchy, "Hierarchy number.")
-
-    def get_input_form(self) -> dict[str, dict]:
-        return {}
-
-
-class HierarchicalMerger(ProcessBase):
-    component_name = "HierarchicalMerger"
-
-    async def _invoke(self, **kwargs):
-        try:
-            from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
-        except Exception as e:
-            self.set_output("_ERROR", f"Input error: {str(e)}")
-            return
-
-        self.set_output("output_format", "chunks")
-        self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
-        if from_upstream.output_format in ["markdown", "text", "html"]:
-            if from_upstream.output_format == "markdown":
-                payload = from_upstream.markdown_result
-            elif from_upstream.output_format == "text":
-                payload = from_upstream.text_result
-            else:  # == "html"
-                payload = from_upstream.html_result
-
-            if not payload:
-                payload = ""
-
-            lines = [ln for ln in payload.split("\n") if ln]
-        else:
-            arr = from_upstream.chunks if from_upstream.output_format == "chunks" else from_upstream.json_result
-            lines = [o.get("text", "") for o in arr]
-            sections, section_images = [], []
-            for o in arr or []:
-                sections.append((o.get("text", ""), o.get("position_tag", "")))
-                section_images.append(o.get("img_id"))
-
-        matches = []
-        for txt in lines:
-            good = False
-            for lvl, regs in enumerate(self._param.levels):
-                for reg in regs:
-                    if re.search(reg, txt):
-                        matches.append(lvl)
-                        good = True
-                        break
-                if good:
-                    break
-            if not good:
-                matches.append(len(self._param.levels))
-        assert len(matches) == len(lines), f"{len(matches)} vs. {len(lines)}"
-
-        root = {
-            "level": -1,
-            "index": -1,
-            "texts": [],
-            "children": []
-        }
-        for i, m in enumerate(matches):
-            if m == 0:
-                root["children"].append({
-                    "level": m,
-                    "index": i,
-                    "texts": [],
-                    "children": []
-                })
-            elif m == len(self._param.levels):
-                def dfs(b):
-                    if not b["children"]:
-                        b["texts"].append(i)
-                    else:
-                        dfs(b["children"][-1])
-                dfs(root)
-            else:
-                def dfs(b):
-                    nonlocal m, i
-                    if not b["children"] or  m == b["level"] + 1:
-                        b["children"].append({
-                            "level": m,
-                            "index": i,
-                            "texts": [],
-                            "children": []
-                        })
-                        return
-                    dfs(b["children"][-1])
-
-                dfs(root)
-
-        all_pathes = []
-        def dfs(n, path, depth):
-            nonlocal all_pathes
-            if not n["children"] and path:
-                all_pathes.append(path)
-
-            for nn in n["children"]:
-                if depth < self._param.hierarchy:
-                    _path = deepcopy(path)
-                else:
-                    _path = path
-                _path.extend([nn["index"], *nn["texts"]])
-                dfs(nn, _path, depth+1)
-
-                if depth == self._param.hierarchy:
-                    all_pathes.append(_path)
-
-        for i in range(len(lines)):
-            print(i, lines[i])
-        dfs(root, [], 0)
-
-        if root["texts"]:
-            all_pathes.insert(0, root["texts"])
-        if from_upstream.output_format in ["markdown", "text", "html"]:
-            cks = []
-            for path in all_pathes:
-                txt = ""
-                for i in path:
-                    txt += lines[i] + "\n"
-                cks.append(txt)
-
-            self.set_output("chunks", [{"text": c} for c in cks if c])
-        else:
-            cks = []
-            images = []
-            for path in all_pathes:
-                txt = ""
-                img = None
-                for i in path:
-                    txt += lines[i] + "\n"
-                    concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
-                cks.append(txt)
-                images.append(img)
-
-            cks = [
-                {
-                    "text": RAGFlowPdfParser.remove_tag(c),
-                    "image": img,
-                    "positions": RAGFlowPdfParser.extract_positions(c),
-                }
-                for c, img in zip(cks, images)
-            ]
-            async with trio.open_nursery() as nursery:
-                for d in cks:
-                    nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
-            self.set_output("chunks", cks)
-
-        self.callback(1, "Done.")
--- a/rag/flow/hierarchical_merger/schema.py
+++ b/rag/flow/hierarchical_merger/schema.py
@ -1,37 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-from typing import Any, Literal
-
-from pydantic import BaseModel, ConfigDict, Field
-
-
-class HierarchicalMergerFromUpstream(BaseModel):
-    created_time: float | None = Field(default=None, alias="_created_time")
-    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
-
-    name: str
-    file: dict | None = Field(default=None)
-    chunks: list[dict[str, Any]] | None = Field(default=None)
-
-    output_format: Literal["json", "chunks"] | None = Field(default=None)
-    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
-    markdown_result: str | None = Field(default=None, alias="markdown")
-    text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
-
-    model_config = ConfigDict(populate_by_name=True, extra="forbid")
-
-    # def to_dict(self, *, exclude_none: bool = True) -> dict:
-    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -13,28 +13,20 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 import io
-import json
-import os
+import logging
 import random
-from functools import partial

 import trio
 import numpy as np
 from PIL import Image

 from api.db import LLMType
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
 from api.db.services.llm_service import LLMBundle
-from api.utils import get_uuid
-from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
-from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
-from rag.utils.storage_factory import STORAGE_IMPL


 class ParserParam(ProcessParamBase):
@ -53,14 +45,12 @@ class ParserParam(ProcessParamBase):
            "word": [
                "json",
            ],
-            "slides": [
-                "json",
-            ],
+            "ppt": [],
            "image": [
                "text"
            ],
-            "email": ["text", "json"],
-            "text&markdown": [
+            "email": [],
+            "text": [
                "text",
                "json"
            ],
@ -73,6 +63,7 @@ class ParserParam(ProcessParamBase):
        self.setups = {
            "pdf": {
                "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
+                "llm_id": "",
                "lang": "Chinese",
                "suffix": [
                    "pdf",
@ -94,29 +85,23 @@ class ParserParam(ProcessParamBase):
                ],
                "output_format": "json",
            },
-            "text&markdown": {
-                "suffix": ["md", "markdown", "mdx", "txt"],
-                "output_format": "json",
-            },
-            "slides": {
-                "suffix": [
-                    "pptx",
-                ],
+            "markdown": {
+                "suffix": ["md", "markdown"],
                "output_format": "json",
            },
+            "ppt": {},
            "image": {
-                "parse_method": "ocr",
+                "parse_method": ["ocr", "vlm"],
                "llm_id": "",
                "lang": "Chinese",
-                "system_prompt": "",
                "suffix": ["jpg", "jpeg", "png", "gif"],
-                "output_format": "text",
+                "output_format": "json",
            },
-            "email": {
+            "email": {},
+            "text": {
                "suffix": [
-                  "eml", "msg"
+                    "txt"
                ],
-                "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
                "output_format": "json",
            },
            "audio": {
@ -146,10 +131,13 @@ class ParserParam(ProcessParamBase):
        pdf_config = self.setups.get("pdf", {})
        if pdf_config:
            pdf_parse_method = pdf_config.get("parse_method", "")
-            self.check_empty(pdf_parse_method, "Parse method abnormal.")
+            self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])

-            if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
-                self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
+            if pdf_parse_method not in ["deepdoc", "plain_text"]:
+                self.check_empty(pdf_config.get("llm_id"), "VLM")
+
+            pdf_language = pdf_config.get("lang", "")
+            self.check_empty(pdf_language, "Language")

            pdf_output_format = pdf_config.get("output_format", "")
            self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
@ -159,38 +147,32 @@ class ParserParam(ProcessParamBase):
            spreadsheet_output_format = spreadsheet_config.get("output_format", "")
            self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])

-        doc_config = self.setups.get("word", "")
+        doc_config = self.setups.get("doc", "")
        if doc_config:
            doc_output_format = doc_config.get("output_format", "")
-            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
-
-        slides_config = self.setups.get("slides", "")
-        if slides_config:
-            slides_output_format = slides_config.get("output_format", "")
-            self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
+            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"])

        image_config = self.setups.get("image", "")
        if image_config:
            image_parse_method = image_config.get("parse_method", "")
+            self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
            if image_parse_method not in ["ocr"]:
-                self.check_empty(image_config.get("lang", ""), "Image VLM language")
+                self.check_empty(image_config.get("llm_id"), "VLM")

-        text_config = self.setups.get("text&markdown", "")
+            image_language = image_config.get("lang", "")
+            self.check_empty(image_language, "Language")
+
+        text_config = self.setups.get("text", "")
        if text_config:
            text_output_format = text_config.get("output_format", "")
-            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
+            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])

        audio_config = self.setups.get("audio", "")
        if audio_config:
-            self.check_empty(audio_config.get("llm_id"), "Audio VLM")
+            self.check_empty(audio_config.get("llm_id"), "VLM")
            audio_language = audio_config.get("lang", "")
            self.check_empty(audio_language, "Language")

-        email_config = self.setups.get("email", "")
-        if email_config:
-            email_output_format = email_config.get("output_format", "")
-            self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
-
    def get_input_form(self) -> dict[str, dict]:
        return {}

@ -198,18 +180,21 @@ class ParserParam(ProcessParamBase):
 class Parser(ProcessBase):
    component_name = "Parser"

-    def _pdf(self, name, blob):
+    def _pdf(self, from_upstream: ParserFromUpstream):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
+
+        blob = from_upstream.blob
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])

-        if conf.get("parse_method").lower() == "deepdoc":
+        if conf.get("parse_method") == "deepdoc":
            bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
-        elif conf.get("parse_method").lower() == "plain_text":
+        elif conf.get("parse_method") == "plain_text":
            lines, _ = PlainParser()(blob)
            bboxes = [{"text": t} for t, _ in lines]
        else:
-            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
+            assert conf.get("llm_id")
+            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
            lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
            bboxes = []
            for t, poss in lines:
@ -229,63 +214,66 @@ class Parser(ProcessBase):
                mkdn += b.get("text", "") + "\n"
            self.set_output("markdown", mkdn)

-    def _spreadsheet(self, name, blob):
+    def _spreadsheet(self, from_upstream: ParserFromUpstream):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
+
+        blob = from_upstream.blob
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
+
+        print("spreadsheet {conf=}", flush=True)
        spreadsheet_parser = ExcelParser()
        if conf.get("output_format") == "html":
-            htmls = spreadsheet_parser.html(blob, 1000000000)
-            self.set_output("html", htmls[0])
+            html = spreadsheet_parser.html(blob, 1000000000)
+            self.set_output("html", html)
        elif conf.get("output_format") == "json":
            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
        elif conf.get("output_format") == "markdown":
            self.set_output("markdown", spreadsheet_parser.markdown(blob))

-    def _word(self, name, blob):
+    def _word(self, from_upstream: ParserFromUpstream):
+        from tika import parser as  word_parser
+
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
+
+        blob = from_upstream.blob
+        name = from_upstream.name
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
-        docx_parser = Docx()
-        sections, tbls = docx_parser(name, binary=blob)
-        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
-        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
+
+        print("word {conf=}", flush=True)
+        doc_parsed = word_parser.from_buffer(blob)
+
+        sections = []
+        if doc_parsed.get("content"):
+            sections = doc_parsed["content"].split("\n")
+            sections = [{"text": section} for section in sections if section]
+        else:
+            logging.warning(f"tika.parser got empty content from {name}.")
+
        # json
        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
            self.set_output("json", sections)

-    def _slides(self, name, blob):
-        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
-
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
-
-        conf = self._param.setups["slides"]
-        self.set_output("output_format", conf["output_format"])
-
-        ppt_parser = ppt_parser()
-        txts = ppt_parser(blob, 0, 100000, None)
-
-        sections = [{"text": section} for section in txts if section.strip()]
-
-        # json
-        assert conf.get("output_format") == "json", "have to be json for ppt"
-        if conf.get("output_format") == "json":
-            self.set_output("json", sections)
-
-    def _markdown(self, name, blob):
+    def _markdown(self, from_upstream: ParserFromUpstream):
        from functools import reduce

        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
-        conf = self._param.setups["text&markdown"]
+
+        blob = from_upstream.blob
+        name = from_upstream.name
+        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])

        markdown_parser = naive_markdown_parser()
        sections, tables = markdown_parser(name, blob, separate_tables=False)

+        # json
+        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
            json_results = []

@ -303,51 +291,69 @@ class Parser(ProcessBase):
                json_results.append(json_result)

            self.set_output("json", json_results)
+
+    def _text(self, from_upstream: ParserFromUpstream):
+        from deepdoc.parser.utils import get_text
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
+
+        blob = from_upstream.blob
+        name = from_upstream.name
+        conf = self._param.setups["text"]
+        self.set_output("output_format", conf["output_format"])
+
+        # parse binary to text
+        text_content = get_text(name, binary=blob)
+
+        if conf.get("output_format") == "json":
+            result = [{"text": text_content}]
+            self.set_output("json", result)
        else:
-            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
+            result = text_content
+            self.set_output("text", result)

-
-    def _image(self, name, blob):
+    def _image(self, from_upstream: ParserFromUpstream):
        from deepdoc.vision import OCR

        self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
+
+        blob = from_upstream.blob
        conf = self._param.setups["image"]
        self.set_output("output_format", conf["output_format"])

        img = Image.open(io.BytesIO(blob)).convert("RGB")
+        lang = conf["lang"]

        if conf["parse_method"] == "ocr":
            # use ocr, recognize chars only
            ocr = OCR()
            bxs = ocr(np.array(img))  # return boxes and recognize result
            txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
        else:
-            lang = conf["lang"]
            # use VLM to describe the picture
-            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
+            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
            img_binary = io.BytesIO()
            img.save(img_binary, format="JPEG")
            img_binary.seek(0)
-
-            system_prompt = conf.get("system_prompt")
-            if system_prompt:
-                txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
-            else:
-                txt = cv_model.describe(img_binary.read())
+            txt = cv_model.describe(img_binary.read())

        self.set_output("text", txt)

-    def _audio(self, name, blob):
+    def _audio(self, from_upstream: ParserFromUpstream):
        import os
        import tempfile

        self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

+        blob = from_upstream.blob
+        name = from_upstream.name
        conf = self._param.setups["audio"]
        self.set_output("output_format", conf["output_format"])

        lang = conf["lang"]
        _, ext = os.path.splitext(name)
+        tmp_path = ""
        with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
            tmpf.write(blob)
            tmpf.flush()
@ -358,131 +364,15 @@ class Parser(ProcessBase):

            self.set_output("text", txt)

-    def _email(self, name, blob):
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
-
-        email_content = {}
-        conf = self._param.setups["email"]
-        target_fields = conf["fields"]
-
-        _, ext = os.path.splitext(name)
-        if ext == ".eml":
-            # handle eml file
-            from email import policy
-            from email.parser import BytesParser
-
-            msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
-            email_content['metadata'] = {}
-            # handle header info
-            for header, value in msg.items():
-                # get fields like from, to, cc, bcc, date, subject
-                if header.lower() in target_fields:
-                    email_content[header.lower()] = value
-                # get metadata
-                elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
-                    email_content["metadata"][header.lower()] = value
-            # get body
-            if "body" in target_fields:
-                body_text, body_html = [], []
-                def _add_content(m, content_type):
-                    if content_type == "text/plain":
-                        body_text.append(
-                            m.get_payload(decode=True).decode(m.get_content_charset())
-                        )
-                    elif content_type == "text/html":
-                        body_html.append(
-                            m.get_payload(decode=True).decode(m.get_content_charset())
-                        )
-                    elif "multipart" in content_type:
-                        if m.is_multipart():
-                            for part in m.iter_parts():
-                                _add_content(part, part.get_content_type())
-
-                _add_content(msg, msg.get_content_type())
-
-                email_content["text"] = body_text
-                email_content["text_html"] = body_html
-            # get attachment
-            if "attachments" in target_fields:
-                attachments = []
-                for part in msg.iter_attachments():
-                    content_disposition = part.get("Content-Disposition")
-                    if content_disposition:
-                        dispositions = content_disposition.strip().split(";")
-                        if dispositions[0].lower() == "attachment":
-                            filename = part.get_filename()
-                            payload = part.get_payload(decode=True)
-                            attachments.append({
-                                "filename": filename,
-                                "payload": payload,
-                            })
-                email_content["attachments"] = attachments
-        else:
-            # handle msg file
-            import extract_msg
-            print("handle a msg file.")
-            msg = extract_msg.Message(blob)
-            # handle header info
-            basic_content = {
-                "from": msg.sender,
-                "to": msg.to,
-                "cc": msg.cc,
-                "bcc": msg.bcc,
-                "date": msg.date,
-                "subject": msg.subject,
-            }
-            email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
-            # get metadata
-            email_content['metadata'] = {
-                'message_id': msg.messageId,
-                'in_reply_to': msg.inReplyTo,
-            }
-            # get body
-            if "body" in target_fields:
-                email_content["text"] = msg.body  # usually empty. try text_html instead
-                email_content["text_html"] = msg.htmlBody
-            # get attachments
-            if "attachments" in target_fields:
-                attachments = []
-                for t in msg.attachments:
-                    attachments.append({
-                        "filename": t.name,
-                        "payload": t.data  # binary
-                    })
-                email_content["attachments"] = attachments
-
-        if conf["output_format"] == "json":
-            self.set_output("json", [email_content])
-        else:
-            content_txt = ''
-            for k, v in email_content.items():
-                if isinstance(v, str):
-                    # basic info
-                    content_txt += f'{k}:{v}' + "\n"
-                elif isinstance(v, dict):
-                    # metadata
-                    content_txt += f'{k}:{json.dumps(v)}' + "\n"
-                elif isinstance(v, list):
-                    # attachments or others
-                    for fb in v:
-                        if isinstance(fb, dict):
-                            # attachments
-                            content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
-                        else:
-                            # str, usually plain text
-                            content_txt += fb
-            self.set_output("text", content_txt)
-
    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
-            "text&markdown": self._markdown,
+            "markdown": self._markdown,
            "spreadsheet": self._spreadsheet,
-            "slides": self._slides,
            "word": self._word,
+            "text": self._text,
            "image": self._image,
            "audio": self._audio,
-            "email": self._email,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
@ -490,25 +380,8 @@ class Parser(ProcessBase):
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

-        name = from_upstream.name
-        if self._canvas._doc_id:
-            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
-            blob = STORAGE_IMPL.get(b, n)
-        else:
-            blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
-
-        done = False
        for p_type, conf in self._param.setups.items():
            if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
                continue
-            await trio.to_thread.run_sync(function_map[p_type], name, blob)
-            done = True
+            await trio.to_thread.run_sync(function_map[p_type], from_upstream)
            break
-
-        if not done:
-            raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
-
-        outs = self.output()
-        async with trio.open_nursery() as nursery:
-            for d in outs.get("json", []):
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
--- a/rag/flow/parser/schema.py
+++ b/rag/flow/parser/schema.py
@ -20,5 +20,6 @@ class ParserFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    file: dict | None = Field(default=None)
+    blob: bytes
+
    model_config = ConfigDict(populate_by_name=True, extra="forbid")
--- a/rag/flow/pipeline.py
+++ b/rag/flow/pipeline.py
@ -17,92 +17,41 @@ import datetime
 import json
 import logging
 import random
-from timeit import default_timer as timer
+import time
+
 import trio
+
 from agent.canvas import Graph
 from api.db.services.document_service import DocumentService
-from api.db.services.task_service import has_canceled, TaskService, CANVAS_DEBUG_DOC_ID
 from rag.utils.redis_conn import REDIS_CONN


 class Pipeline(Graph):
-    def __init__(self, dsl: str|dict, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
-        if isinstance(dsl, dict):
-            dsl = json.dumps(dsl, ensure_ascii=False)
+    def __init__(self, dsl: str, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
        super().__init__(dsl, tenant_id, task_id)
-        if doc_id == CANVAS_DEBUG_DOC_ID:
-            doc_id = None
        self._doc_id = doc_id
        self._flow_id = flow_id
        self._kb_id = None
-        if self._doc_id:
+        if doc_id:
            self._kb_id = DocumentService.get_knowledgebase_id(doc_id)
-            if not self._kb_id:
-                self._doc_id = None
+            assert self._kb_id, f"Can't find KB of this document: {doc_id}"

    def callback(self, component_name: str, progress: float | int | None = None, message: str = "") -> None:
-        from rag.svr.task_executor import TaskCanceledException
        log_key = f"{self._flow_id}-{self.task_id}-logs"
-        timestamp = timer()
-        if has_canceled(self.task_id):
-            progress = -1
-            message += "[CANCEL]"
        try:
            bin = REDIS_CONN.get(log_key)
            obj = json.loads(bin.encode("utf-8"))
            if obj:
-                if obj[-1]["component_id"] == component_name:
-                    obj[-1]["trace"].append(
-                        {
-                            "progress": progress,
-                            "message": message,
-                            "datetime": datetime.datetime.now().strftime("%H:%M:%S"),
-                            "timestamp": timestamp,
-                            "elapsed_time": timestamp - obj[-1]["trace"][-1]["timestamp"],
-                        }
-                    )
+                if obj[-1]["component_name"] == component_name:
+                    obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")})
                else:
-                    obj.append(
-                        {
-                            "component_id": component_name,
-                            "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
-                        }
-                    )
+                    obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
            else:
-                obj = [
-                    {
-                        "component_id": component_name,
-                        "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
-                    }
-                ]
-            if component_name != "END" and self._doc_id and self.task_id:
-                percentage = 1.0 / len(self.components.items())
-                finished = 0.0
-                for o in obj:
-                    for t in o["trace"]:
-                        if t["progress"] < 0:
-                            finished = -1
-                            break
-                    if finished < 0:
-                        break
-                    finished += o["trace"][-1]["progress"] * percentage
-
-                msg = ""
-                if len(obj[-1]["trace"]) == 1:
-                    msg += f"\n-------------------------------------\n[{self.get_component_name(o['component_id'])}]:\n"
-                t = obj[-1]["trace"][-1]
-                msg += "%s: %s\n" % (t["datetime"], t["message"])
-                TaskService.update_progress(self.task_id, {"progress": finished, "progress_msg": msg})
-            elif component_name == "END" and not self._doc_id:
-                obj[-1]["trace"][-1]["dsl"] = json.loads(str(self))
-            REDIS_CONN.set_obj(log_key, obj, 60 * 30)
-
+                obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
+            REDIS_CONN.set_obj(log_key, obj, 60 * 10)
        except Exception as e:
            logging.exception(e)

-        if has_canceled(self.task_id):
-            raise TaskCanceledException(message)
-
    def fetch_logs(self):
        log_key = f"{self._flow_id}-{self.task_id}-logs"
        try:
@ -113,32 +62,34 @@ class Pipeline(Graph):
            logging.exception(e)
        return []

-
-    async def run(self, **kwargs):
+    def reset(self):
+        super().reset()
        log_key = f"{self._flow_id}-{self.task_id}-logs"
        try:
            REDIS_CONN.set_obj(log_key, [], 60 * 10)
        except Exception as e:
            logging.exception(e)
-        self.error = ""
+
+    async def run(self, **kwargs):
+        st = time.perf_counter()
        if not self.path:
            self.path.append("File")
+
+        if self._doc_id:
+            DocumentService.update_by_id(
+                self._doc_id, {"progress": random.randint(0, 5) / 100.0, "progress_msg": "Start the pipeline...", "process_begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
+            )
+
+        self.error = ""
+        idx = len(self.path) - 1
+        if idx == 0:
            cpn_obj = self.get_component_obj(self.path[0])
            await cpn_obj.invoke(**kwargs)
            if cpn_obj.error():
                self.error = "[ERROR]" + cpn_obj.error()
-                self.callback(cpn_obj.component_name, -1, self.error)
-
-        if self._doc_id:
-            TaskService.update_progress(self.task_id, {
-                "progress": random.randint(0, 5) / 100.0,
-                "progress_msg": "Start the pipeline...",
-                "begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
-
-        idx = len(self.path) - 1
-        cpn_obj = self.get_component_obj(self.path[idx])
-        idx += 1
-        self.path.extend(cpn_obj.get_downstream())
+            else:
+                idx += 1
+                self.path.extend(cpn_obj.get_downstream())

        while idx < len(self.path) and not self.error:
            last_cpn = self.get_component_obj(self.path[idx - 1])
@ -147,28 +98,15 @@ class Pipeline(Graph):
            async def invoke():
                nonlocal last_cpn, cpn_obj
                await cpn_obj.invoke(**last_cpn.output())
-                #if inspect.iscoroutinefunction(cpn_obj.invoke):
-                #    await cpn_obj.invoke(**last_cpn.output())
-                #else:
-                #    cpn_obj.invoke(**last_cpn.output())

            async with trio.open_nursery() as nursery:
                nursery.start_soon(invoke)
-
            if cpn_obj.error():
                self.error = "[ERROR]" + cpn_obj.error()
-                self.callback(cpn_obj._id, -1, self.error)
+                self.callback(cpn_obj.component_name, -1, self.error)
                break
            idx += 1
            self.path.extend(cpn_obj.get_downstream())

-        self.callback("END", 1 if not self.error else -1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))
-
-        if not self.error:
-            return self.get_component_obj(self.path[-1]).output()
-
-        TaskService.update_progress(self.task_id, {
-            "progress": -1,
-            "progress_msg": f"[ERROR]: {self.error}"})
-
-        return {}
+        if self._doc_id:
+            DocumentService.update_by_id(self._doc_id, {"progress": 1 if not self.error else -1, "progress_msg": "Pipeline finished...\n" + self.error, "process_duration": time.perf_counter() - st})
--- a/rag/flow/splitter/init.py
+++ b/rag/flow/splitter/init.py
@ -1,15 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-
--- a/rag/flow/splitter/schema.py
+++ b/rag/flow/splitter/schema.py
@ -1,38 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-from typing import Any, Literal
-
-from pydantic import BaseModel, ConfigDict, Field
-
-
-class SplitterFromUpstream(BaseModel):
-    created_time: float | None = Field(default=None, alias="_created_time")
-    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
-
-    name: str
-    file: dict | None = Field(default=None)
-    chunks: list[dict[str, Any]] | None = Field(default=None)
-
-    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
-
-    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
-    markdown_result: str | None = Field(default=None, alias="markdown")
-    text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
-
-    model_config = ConfigDict(populate_by_name=True, extra="forbid")
-
-    # def to_dict(self, *, exclude_none: bool = True) -> dict:
-    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/splitter/splitter.py
+++ b/rag/flow/splitter/splitter.py
@ -1,111 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import random
-from functools import partial
-
-import trio
-
-from api.utils import get_uuid
-from api.utils.base64_image import id2image, image2id
-from deepdoc.parser.pdf_parser import RAGFlowPdfParser
-from rag.flow.base import ProcessBase, ProcessParamBase
-from rag.flow.splitter.schema import SplitterFromUpstream
-from rag.nlp import naive_merge, naive_merge_with_images
-from rag.utils.storage_factory import STORAGE_IMPL
-
-
-class SplitterParam(ProcessParamBase):
-    def __init__(self):
-        super().__init__()
-        self.chunk_token_size = 512
-        self.delimiters = ["\n"]
-        self.overlapped_percent = 0
-
-    def check(self):
-        self.check_empty(self.delimiters, "Delimiters.")
-        self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
-        self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
-
-    def get_input_form(self) -> dict[str, dict]:
-        return {}
-
-
-class Splitter(ProcessBase):
-    component_name = "Splitter"
-
-    async def _invoke(self, **kwargs):
-        try:
-            from_upstream = SplitterFromUpstream.model_validate(kwargs)
-        except Exception as e:
-            self.set_output("_ERROR", f"Input error: {str(e)}")
-            return
-
-        deli = ""
-        for d in self._param.delimiters:
-            if len(d) > 1:
-                deli += f"`{d}`"
-            else:
-                deli += d
-
-        self.set_output("output_format", "chunks")
-        self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
-        if from_upstream.output_format in ["markdown", "text", "html"]:
-            if from_upstream.output_format == "markdown":
-                payload = from_upstream.markdown_result
-            elif from_upstream.output_format == "text":
-                payload = from_upstream.text_result
-            else:  # == "html"
-                payload = from_upstream.html_result
-
-            if not payload:
-                payload = ""
-
-            cks = naive_merge(
-                payload,
-                self._param.chunk_token_size,
-                deli,
-                self._param.overlapped_percent,
-            )
-            self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
-
-            self.callback(1, "Done.")
-            return
-
-        # json
-        sections, section_images = [], []
-        for o in from_upstream.json_result or []:
-            sections.append((o.get("text", ""), o.get("position_tag", "")))
-            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
-
-        chunks, images = naive_merge_with_images(
-            sections,
-            section_images,
-            self._param.chunk_token_size,
-            deli,
-            self._param.overlapped_percent,
-        )
-        cks = [
-            {
-                "text": RAGFlowPdfParser.remove_tag(c),
-                "image": img,
-                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
-            }
-            for c, img in zip(chunks, images) if c.strip()
-        ]
-        async with trio.open_nursery() as nursery:
-            for d in cks:
-                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
-        self.set_output("chunks",  cks)
-        self.callback(1, "Done.")
--- a/rag/flow/tests/client.py
+++ b/rag/flow/tests/client.py
@ -30,7 +30,7 @@ def print_logs(pipeline: Pipeline):
    while True:
        time.sleep(5)
        logs = pipeline.fetch_logs()
-        logs_str = json.dumps(logs, ensure_ascii=False)
+        logs_str = json.dumps(logs)
        if logs_str != last_logs:
            print(logs_str)
        last_logs = logs_str
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -38,13 +38,6 @@
                  ],
                  "output_format": "json"
                },
-                "slides": {
-                    "parse_method": "presentation",
-                    "suffix": [
-                        "pptx"
-                    ],
-                    "output_format": "json"
-                },
                "markdown": {
                  "suffix": [
                    "md",
@ -89,36 +82,19 @@
                  "lang": "Chinese",
                  "llm_id": "SenseVoiceSmall",
                  "output_format": "json"
-                },
-                "email": {
-                  "suffix": [
-                    "msg"
-                  ],
-                  "fields": [
-                    "from",
-                    "to",
-                    "cc",
-                    "bcc",
-                    "date",
-                    "subject",
-                    "body",
-                    "attachments"
-                  ],
-                  "output_format": "json"
                }
              }
          }
        },
-        "downstream": ["Splitter:0"],
+        "downstream": ["Chunker:0"],
        "upstream": ["Begin"]
    },
-    "Splitter:0": {
+    "Chunker:0": {
        "obj": {
-            "component_name": "Splitter",
+            "component_name": "Chunker",
            "params": {
-              "chunk_token_size": 512,
-              "delimiters": ["\n"],
-              "overlapped_percent": 0
+              "method": "general",
+              "auto_keywords": 5
            }
        },
        "downstream": ["Tokenizer:0"],
--- a/rag/flow/tests/dsl_examples/hierarchical_merger.json
+++ b/rag/flow/tests/dsl_examples/hierarchical_merger.json
@ -1,84 +0,0 @@
-{
-  "components": {
-    "File": {
-        "obj":{
-            "component_name": "File",
-            "params": {
-            }
-        },
-        "downstream": ["Parser:0"],
-        "upstream": []
-    },
-    "Parser:0": {
-        "obj": {
-            "component_name": "Parser",
-            "params": {
-              "setups": {
-                "pdf": {
-                  "parse_method": "deepdoc",
-                  "vlm_name": "",
-                  "lang": "Chinese",
-                  "suffix": [
-                    "pdf"
-                  ],
-                  "output_format": "json"
-                },
-                "spreadsheet": {
-                  "suffix": [
-                    "xls",
-                    "xlsx",
-                    "csv"
-                  ],
-                  "output_format": "html"
-                },
-                "word": {
-                  "suffix": [
-                    "doc",
-                    "docx"
-                  ],
-                  "output_format": "json"
-                },
-                "markdown": {
-                  "suffix": [
-                    "md",
-                    "markdown"
-                  ],
-                  "output_format": "text"
-                },
-                "text": {
-                  "suffix": ["txt"],
-                  "output_format": "json"
-                }
-              }
-          }
-        },
-        "downstream": ["Splitter:0"],
-        "upstream": ["File"]
-    },
-    "Splitter:0": {
-        "obj": {
-            "component_name": "Splitter",
-            "params": {
-              "chunk_token_size": 512,
-              "delimiters": ["\r\n"],
-              "overlapped_percent": 0
-            }
-        },
-        "downstream": ["HierarchicalMerger:0"],
-        "upstream": ["Parser:0"]
-    },
-    "HierarchicalMerger:0": {
-        "obj": {
-            "component_name": "HierarchicalMerger",
-            "params": {
-              "levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]],
-              "hierarchy": 2
-            }
-        },
-        "downstream": [],
-        "upstream": ["Splitter:0"]
-    }
-  },
-  "path": []
-}
-
--- a/rag/flow/tokenizer/schema.py
+++ b/rag/flow/tokenizer/schema.py
@ -22,16 +22,16 @@ class TokenizerFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str = ""
-    file: dict | None = Field(default=None)
+    blob: bytes

-    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
+    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)

    chunks: list[dict[str, Any]] | None = Field(default=None)

    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
+    html_result: list[str] | None = Field(default=None, alias="html")

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

@ -40,14 +40,12 @@ class TokenizerFromUpstream(BaseModel):
        if self.chunks:
            return self

-        if self.output_format in {"markdown", "text", "html"}:
+        if self.output_format in {"markdown", "text"}:
            if self.output_format == "markdown" and not self.markdown_result:
                raise ValueError("output_format=markdown requires a markdown payload (field: 'markdown' or 'markdown_result').")
            if self.output_format == "text" and not self.text_result:
                raise ValueError("output_format=text requires a text payload (field: 'text' or 'text_result').")
-            if self.output_format == "html" and not self.html_result:
-                raise ValueError("output_format=text requires a html payload (field: 'html' or 'html_result').")
        else:
-            if not self.json_result and not self.chunks:
+            if not self.json_result:
                raise ValueError("When no chunks are provided and output_format is not markdown/text, a JSON list payload is required (field: 'json' or 'json_result').")
        return self
--- a/rag/flow/tokenizer/tokenizer.py
+++ b/rag/flow/tokenizer/tokenizer.py
@ -37,7 +37,6 @@ class TokenizerParam(ProcessParamBase):
        super().__init__()
        self.search_method = ["full_text", "embedding"]
        self.filename_embd_weight = 0.1
-        self.fields = ["text"]

    def check(self):
        for v in self.search_method:
@ -62,14 +61,10 @@ class Tokenizer(ProcessBase):
        embedding_model = LLMBundle(self._canvas._tenant_id, LLMType.EMBEDDING, llm_name=embedding_id)
        texts = []
        for c in chunks:
-            txt = ""
-            for f in self._param.fields:
-                f = c.get(f)
-                if isinstance(f, str):
-                    txt += f
-                elif isinstance(f, list):
-                    txt += "\n".join(f)
-            texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
+            if c.get("questions"):
+                texts.append("\n".join(c["questions"]))
+            else:
+                texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c["text"]))
        vts, c = embedding_model.encode([name])
        token_count += c
        tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
@ -108,36 +103,26 @@ class Tokenizer(ProcessBase):
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

-        self.set_output("output_format", "chunks")
        parts = sum(["full_text" in self._param.search_method, "embedding" in self._param.search_method])
        if "full_text" in self._param.search_method:
            self.callback(random.randint(1, 5) / 100.0, "Start to tokenize.")
            if from_upstream.chunks:
                chunks = from_upstream.chunks
                for i, ck in enumerate(chunks):
-                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
-                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                    if ck.get("questions"):
-                        ck["question_kwd"] = ck["questions"].split("\n")
-                        ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
+                        ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
                    if ck.get("keywords"):
-                        ck["important_kwd"] = ck["keywords"].split(",")
-                        ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
-                    if ck.get("summary"):
-                        ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
-                        ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
-                    else:
-                        ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
-                        ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
+                        ck["important_tks"] = rag_tokenizer.tokenize("\n".join(ck["keywords"]))
+                    ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
+                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                    if i % 100 == 99:
                        self.callback(i * 1.0 / len(chunks) / parts)
-
            elif from_upstream.output_format in ["markdown", "text", "html"]:
                if from_upstream.output_format == "markdown":
                    payload = from_upstream.markdown_result
                elif from_upstream.output_format == "text":
                    payload = from_upstream.text_result
-                else:
+                else:  # == "html"
                    payload = from_upstream.html_result

                if not payload:
@ -145,16 +130,12 @@ class Tokenizer(ProcessBase):

                ck = {"text": payload}
                if "full_text" in self._param.search_method:
-                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
-                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
-                    ck["content_ltks"] = rag_tokenizer.tokenize(payload)
+                    ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                chunks = [ck]
            else:
                chunks = from_upstream.json_result
                for i, ck in enumerate(chunks):
-                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
-                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                    ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                    if i % 100 == 99:
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@ -146,7 +146,7 @@ class Base(ABC):

        response = self.client.chat.completions.create(model=self.model_name, messages=history, **gen_conf, **kwargs)

-        if not response.choices or not response.choices[0].message or not response.choices[0].message.content:
+        if (not response.choices or not response.choices[0].message or not response.choices[0].message.content):
            return "", 0
        ans = response.choices[0].message.content.strip()
        if response.choices[0].finish_reason == "length":
@ -457,7 +457,7 @@ class Base(ABC):
        yield total_tokens

    def total_token_count(self, resp):
-        return total_token_count_from_response(resp)
+       return total_token_count_from_response(resp)

    def _calculate_dynamic_ctx(self, history):
        """Calculate dynamic context window size"""
@ -1305,6 +1305,10 @@ class LiteLLMBase(ABC):
        "302.AI",
    ]

+    import litellm
+
+    litellm._turn_on_debug()
+
    def __init__(self, key, model_name, base_url=None, **kwargs):
        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
        self.provider = kwargs.get("provider", "")
--- a/rag/llm/embedding_model.py
+++ b/rag/llm/embedding_model.py
@ -138,7 +138,7 @@ class OpenAIEmbed(Base):
        ress = []
        total_tokens = 0
        for i in range(0, len(texts), batch_size):
-            res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name, encoding_format="float", extra_body={"drop_params": True})
+            res = self.client.embeddings.create(input=texts[i : i + batch_size], model=self.model_name, encoding_format="float")
            try:
                ress.extend([d.embedding for d in res.data])
                total_tokens += self.total_token_count(res)
@ -147,7 +147,7 @@ class OpenAIEmbed(Base):
        return np.array(ress), total_tokens

    def encode_queries(self, text):
-        res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float",extra_body={"drop_params": True})
+        res = self.client.embeddings.create(input=[truncate(text, 8191)], model=self.model_name, encoding_format="float")
        return np.array(res.data[0].embedding), self.total_token_count(res)


@ -489,6 +489,7 @@ class MistralEmbed(Base):
    def encode_queries(self, text):
        import time
        import random
+
        retry_max = 5
        while retry_max > 0:
            try:
@ -747,7 +748,7 @@ class SILICONFLOWEmbed(Base):
            texts_batch = texts[i : i + batch_size]
            if self.model_name in ["BAAI/bge-large-zh-v1.5", "BAAI/bge-large-en-v1.5"]:
                # limit 512, 340 is almost safe
-                texts_batch = [" " if not text.strip() else truncate(text, 256) for text in texts_batch]
+                texts_batch = [" " if not text.strip() else truncate(text, 340) for text in texts_batch]
            else:
                texts_batch = [" " if not text.strip() else text for text in texts_batch]

@ -955,7 +956,7 @@ class Ai302Embed(Base):
        super().__init__(key, model_name, base_url)


-class CometAPIEmbed(OpenAIEmbed):
+class CometEmbed(OpenAIEmbed):
    _FACTORY_NAME = "CometAPI"

    def __init__(self, key, model_name, base_url="https://api.cometapi.com/v1"):
--- a/rag/llm/sequence2txt_model.py
+++ b/rag/llm/sequence2txt_model.py
@ -236,7 +236,7 @@ class DeepInfraSeq2txt(Base):
        self.model_name = model_name
        
        
-class CometAPISeq2txt(Base):
+class CometSeq2txt(Base):
    _FACTORY_NAME = "CometAPI"

    def __init__(self, key, model_name="whisper-1", base_url="https://api.cometapi.com/v1", **kwargs):
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -292,7 +292,6 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
        res.append(d)
    return res

-
 def tokenize_chunks_with_images(chunks, doc, eng, images):
    res = []
    # wrap up as es documents
@ -307,7 +306,6 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
        res.append(d)
    return res

-
 def tokenize_table(tbls, doc, eng, batch_size=10):
    res = []
    # add tables
@ -581,9 +579,7 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
    if not sections:
        return []
-    if isinstance(sections, str):
-        sections = [sections]
-    if isinstance(sections[0], str):
+    if isinstance(sections[0], type("")):
        sections = [(s, "") for s in sections]
    cks = [""]
    tk_nums = [0]
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -383,7 +383,7 @@ class Dealer:
        vector_column = f"q_{dim}_vec"
        zero_vector = [0.0] * dim
        sim_np = np.array(sim)
-        filtered_count = (sim_np >= similarity_threshold).sum()
+        filtered_count = (sim_np >= similarity_threshold).sum()    
        ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
        for i in idx:
            if sim[i] < similarity_threshold:
@ -444,27 +444,12 @@ class Dealer:
    def chunk_list(self, doc_id: str, tenant_id: str,
                   kb_ids: list[str], max_count=1024,
                   offset=0,
-                   fields=["docnm_kwd", "content_with_weight", "img_id"],
-                   sort_by_position: bool = False):
+                   fields=["docnm_kwd", "content_with_weight", "img_id"]):
        condition = {"doc_id": doc_id}
-
-        fields_set = set(fields or [])
-        if sort_by_position:
-            for need in ("page_num_int", "position_int", "top_int"):
-                if need not in fields_set:
-                    fields_set.add(need)
-        fields = list(fields_set)
-
-        orderBy = OrderByExpr()
-        if sort_by_position:
-            orderBy.asc("page_num_int")
-            orderBy.asc("position_int")
-            orderBy.asc("top_int")
-
        res = []
        bs = 128
        for p in range(offset, max_count, bs):
-            es_res = self.dataStore.search(fields, [], condition, [], orderBy, p, bs, index_name(tenant_id),
+            es_res = self.dataStore.search(fields, [], condition, [], OrderByExpr(), p, bs, index_name(tenant_id),
                                           kb_ids)
            dict_chunks = self.dataStore.getFields(es_res, fields)
            for id, doc in dict_chunks.items():
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@ -436,217 +436,4 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
        return ans
    except Exception:
        logging.exception(f"Loading json failure: {ans}")
-    return []
-
-
-def gen_json(system_prompt:str, user_prompt:str, chat_mdl):
-    _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
-    ans = chat_mdl.chat(msg[0]["content"], msg[1:])
-    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
-    try:
-        return json_repair.loads(ans)
-    except Exception:
-        logging.exception(f"Loading json failure: {ans}")
-
-
-TOC_DETECTION = load_prompt("toc_detection")
-def detect_table_of_contents(page_1024:list[str], chat_mdl):
-    toc_secs = []
-    for i, sec in enumerate(page_1024[:22]):
-        ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_DETECTION).render(page_txt=sec), "Only JSON please.", chat_mdl)
-        if toc_secs and not ans["exists"]:
-            break
-        toc_secs.append(sec)
-    return toc_secs
-
-
-TOC_EXTRACTION = load_prompt("toc_extraction")
-TOC_EXTRACTION_CONTINUE = load_prompt("toc_extraction_continue")
-def extract_table_of_contents(toc_pages, chat_mdl):
-    if not toc_pages:
-        return []
-
-    return gen_json(PROMPT_JINJA_ENV.from_string(TOC_EXTRACTION).render(toc_page="\n".join(toc_pages)), "Only JSON please.", chat_mdl)
-
-
-def toc_index_extractor(toc:list[dict], content:str, chat_mdl):
-    tob_extractor_prompt = """
-    You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.
-
-    The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
-
-    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
-
-    The response should be in the following JSON format: 
-    [
-        {
-            "structure": <structure index, "x.x.x" or None> (string),
-            "title": <title of the section>,
-            "physical_index": "<physical_index_X>" (keep the format)
-        },
-        ...
-    ]
-
-    Only add the physical_index to the sections that are in the provided pages.
-    If the title of the section are not in the provided pages, do not add the physical_index to it.
-    Directly return the final JSON structure. Do not output anything else."""
-
-    prompt = tob_extractor_prompt + '\nTable of contents:\n' + json.dumps(toc, ensure_ascii=False, indent=2) + '\nDocument pages:\n' + content
-    return gen_json(prompt, "Only JSON please.", chat_mdl)
-
-
-TOC_INDEX = load_prompt("toc_index")
-def table_of_contents_index(toc_arr: list[dict], sections: list[str], chat_mdl):
-    if not toc_arr or not sections:
-        return []
-
-    toc_map = {}
-    for i, it in enumerate(toc_arr):
-        k1 = (it["structure"]+it["title"]).replace(" ", "")
-        k2 = it["title"].strip()
-        if k1 not in toc_map:
-            toc_map[k1] = []
-        if k2 not in toc_map:
-            toc_map[k2] = []
-        toc_map[k1].append(i)
-        toc_map[k2].append(i)
-
-    for it in toc_arr:
-        it["indices"] = []
-    for i, sec in enumerate(sections):
-        sec = sec.strip()
-        if sec.replace(" ", "") in toc_map:
-            for j in toc_map[sec.replace(" ", "")]:
-                toc_arr[j]["indices"].append(i)
-
-    all_pathes = []
-    def dfs(start, path):
-        nonlocal all_pathes
-        if start >= len(toc_arr):
-            if path:
-                all_pathes.append(path)
-            return
-        if not toc_arr[start]["indices"]:
-            dfs(start+1, path)
-            return
-        added = False
-        for j in toc_arr[start]["indices"]:
-            if path and j < path[-1][0]:
-                continue
-            _path = deepcopy(path)
-            _path.append((j, start))
-            added = True
-            dfs(start+1, _path)
-        if not added and path:
-            all_pathes.append(path)
-
-    dfs(0, [])
-    path = max(all_pathes, key=lambda x:len(x))
-    for it in toc_arr:
-        it["indices"] = []
-    for j, i in path:
-        toc_arr[i]["indices"] = [j]
-    print(json.dumps(toc_arr, ensure_ascii=False, indent=2))
-
-    i = 0
-    while i < len(toc_arr):
-        it  = toc_arr[i]
-        if it["indices"]:
-            i += 1
-            continue
-
-        if i>0 and toc_arr[i-1]["indices"]:
-            st_i = toc_arr[i-1]["indices"][-1]
-        else:
-            st_i = 0
-        e = i + 1
-        while e <len(toc_arr) and not toc_arr[e]["indices"]:
-            e += 1
-        if e >= len(toc_arr):
-            e = len(sections)
-        else:
-            e = toc_arr[e]["indices"][0]
-
-        for j in range(st_i, min(e+1, len(sections))):
-            ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_INDEX).render(
-                structure=it["structure"],
-                title=it["title"],
-                text=sections[j]), "Only JSON please.", chat_mdl)
-            if ans["exist"] == "yes":
-                it["indices"].append(j)
-                break
-
-        i += 1
-
-    return toc_arr
-
-
-def check_if_toc_transformation_is_complete(content, toc, chat_mdl):
-    prompt = """
-    You are given a raw table of contents and a  table of contents.
-    Your job is to check if the  table of contents is complete.
-
-    Reply format:
-    {{
-        "thinking": <why do you think the cleaned table of contents is complete or not>
-        "completed": "yes" or "no"
-    }}
-    Directly return the final JSON structure. Do not output anything else."""
-
-    prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
-    response = gen_json(prompt, "Only JSON please.", chat_mdl)
-    return response['completed']
-
-
-def toc_transformer(toc_pages, chat_mdl):
-    init_prompt = """
-    You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents.
-
-    The `structure` is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
-    The `title` is a short phrase or a several-words term.
-    
-    The response should be in the following JSON format: 
-    [
-        {
-            "structure": <structure index, "x.x.x" or None> (string),
-            "title": <title of the section>
-        },
-        ...
-    ],
-    You should transform the full table of contents in one go.
-    Directly return the final JSON structure, do not output anything else. """
-
-    toc_content = "\n".join(toc_pages)
-    prompt = init_prompt + '\n Given table of contents\n:' + toc_content
-    def clean_toc(arr):
-        for a in arr:
-            a["title"] = re.sub(r"[.·….]{2,}", "", a["title"])
-    last_complete = gen_json(prompt, "Only JSON please.", chat_mdl)
-    if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl)
-    clean_toc(last_complete)
-    if if_complete == "yes":
-        return last_complete
-
-    while not (if_complete == "yes"):
-        prompt = f"""
-        Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
-        The response should be in the following JSON format: 
-
-        The raw table of contents json structure is:
-        {toc_content}
-
-        The incomplete transformed table of contents json structure is:
-        {json.dumps(last_complete[-24:], ensure_ascii=False, indent=2)}
-
-        Please continue the json structure, directly output the remaining part of the json structure."""
-        new_complete = gen_json(prompt, "Only JSON please.", chat_mdl)
-        if not new_complete or str(last_complete).find(str(new_complete)) >= 0:
-            break
-        clean_toc(new_complete)
-        last_complete.extend(new_complete)
-        if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl)
-
-    return last_complete
-
-
-
+    return []
--- a/rag/prompts/toc_detection.md
+++ b/rag/prompts/toc_detection.md
@ -1,29 +0,0 @@
-You are an AI assistant designed to analyze text content and detect whether a table of contents (TOC) list exists on the given page. Follow these steps:  
-
-1. **Analyze the Input**: Carefully review the provided text content.  
-2. **Identify Key Features**: Look for common indicators of a TOC, such as:  
-   - Section titles or headings paired with page numbers.
-   - Patterns like repeated formatting (e.g., bold/italicized text, dots/dashes between titles and numbers).  
-   - Phrases like "Table of Contents," "Contents," or similar headings.  
-   - Logical grouping of topics/subtopics with sequential page references.  
-3. **Discern Negative  Features**:
-   - The text contains no numbers, or the numbers present are clearly not page references (e.g., dates, statistical figures, phone numbers, version numbers).
-   - The text consists of full, descriptive sentences and paragraphs that form a narrative, present arguments, or explain concepts, rather than succinctly listing topics.
-   - Contains citations with authors, publication years, journal titles, and page ranges (e.g., "Smith, J. (2020). Journal Title, 10(2), 45-67.").
-   - Lists keywords or terms followed by multiple page numbers, often in alphabetical order.
-   - Comprises terms followed by their definitions or explanations.
-   - Labeled with headers like "Appendix A," "Appendix B," etc.
-   - Contains expressive language thanking individuals or organizations for their support or contributions.
-4. **Evaluate Evidence**: Weigh the presence/absence of these features to determine if the content resembles a TOC.
-5. **Output Format**: Provide your response in the following JSON structure:  
-   ```json  
-   {  
-     "reasoning": "Step-by-step explanation of your analysis based on the features identified." ,
-     "exists": true/false
-   }  
-   ```  
-6. **DO NOT** output anything else except JSON structure.
-
-**Input text Content ( Text-Only Extraction ):**  
-{{ page_txt }} 
-
--- a/rag/prompts/toc_extraction.md
+++ b/rag/prompts/toc_extraction.md
@ -1,53 +0,0 @@
-You are an expert parser and data formatter. Your task is to analyze the provided table of contents (TOC) text and convert it into a valid JSON array of objects.
-
-**Instructions:**
-1.  Analyze each line of the input TOC.
-2.  For each line, extract the following three pieces of information:
-    *   `structure`: The hierarchical index/numbering (e.g., "1", "2.1", "3.2.5", "A.1"). If a line has no visible numbering or structure indicator (like a main "Chapter" title), use `null`.
-    *   `title`: The textual title of the section or chapter. This should be the main descriptive text, clean and without the page number.
-3.  Output **only** a valid JSON array. Do not include any other text, explanations, or markdown code block fences (like ```json) in your response.
-
-**JSON Format:**
-The output must be a list of objects following this exact schema:
-```json
-[
-    {
-        "structure": <structure index, "x.x.x" or None> (string）,
-        "title": <title of the section>
-    },
-    ...
-]
-```
-
-**Input Example:**
-```
-Contents
-1 Introduction to the System ... 1
-1.1 Overview .... 2
-1.2 Key Features .... 5
-2 Installation Guide ....8
-2.1 Prerequisites ........ 9
-2.2 Step-by-Step Process ........ 12
-Appendix A: Specifications ..... 45
-References ... 47
-```
-
-**Expected Output For The Example:**
-```json
-[
-    {"structure": null, "title": "Contents"},
-    {"structure": "1", "title": "Introduction to the System"},
-    {"structure": "1.1", "title": "Overview"},
-    {"structure": "1.2", "title": "Key Features"},
-    {"structure": "2", "title": "Installation Guide"},
-    {"structure": "2.1", "title": "Prerequisites"},
-    {"structure": "2.2", "title": "Step-by-Step Process"},
-    {"structure": "A", "title": "Specifications"},
-    {"structure": null, "title": "References"}
-]
-```
-
-**Now, process the following TOC input:**
-```
-{{ toc_page }}
-```
--- a/rag/prompts/toc_extraction_continue.md
+++ b/rag/prompts/toc_extraction_continue.md
@ -1,60 +0,0 @@
-You are an expert parser and data formatter, currently in the process of building a JSON array from a multi-page table of contents (TOC). Your task is to analyze the new page of content and **append** the new entries to the existing JSON array.
-
-**Instructions:**
-1.  You will be given two inputs:
-    *   `current_page_text`: The text content from the new page of the TOC.
-    *   `existing_json`: The valid JSON array you have generated from the previous pages.
-2.  Analyze each line of the `current_page_text` input.
-3.  For each new line, extract the following three pieces of information:
-    *   `structure`: The hierarchical index/numbering (e.g., "1", "2.1", "3.2.5"). Use `null` if none exists.
-    *   `title`: The clean textual title of the section or chapter.
-    *   `page`: The page number on which the section starts. Extract only the number. Use `null` if not present.
-4.  **Append these new entries** to the `existing_json` array. Do not modify, reorder, or delete any of the existing entries.
-5.  Output **only** the complete, updated JSON array. Do not include any other text, explanations, or markdown code block fences (like ```json).
-
-**JSON Format:**
-The output must be a valid JSON array following this schema:
-```json
-[
-    {
-        "structure": <string or null>,
-        "title": <string>,
-        "page": <number or null>
-    },
-    ...
-]
-```
-
-**Input Example:**
-`current_page_text`:
-```
-3.2 Advanced Configuration ........... 25
-3.3 Troubleshooting .................. 28
-4 User Management .................... 30
-```
-
-`existing_json`:
-```json
-[
-    {"structure": "1", "title": "Introduction", "page": 1},
-    {"structure": "2", "title": "Installation", "page": 5},
-    {"structure": "3", "title": "Configuration", "page": 12},
-    {"structure": "3.1", "title": "Basic Setup", "page": 15}
-]
-```
-
-**Expected Output For The Example:**
-```json
-[
-    {"structure": "3.2", "title": "Advanced Configuration", "page": 25},
-    {"structure": "3.3", "title": "Troubleshooting", "page": 28},
-    {"structure": "4", "title": "User Management", "page": 30}
-]
-```
-
-**Now, process the following inputs:**
-`current_page_text`:
-{{ toc_page }}
-
-`existing_json`:
-{{ toc_json }}
--- a/rag/prompts/toc_index.md
+++ b/rag/prompts/toc_index.md
@ -1,20 +0,0 @@
-You are an expert analyst tasked with matching text content to the title.
-
-**Instructions:**
-1. Analyze the given title with its numeric structure index and the provided text.
-2. Determine whether the title is mentioned as a section tile in the given text.
-3. Provide a concise, step-by-step reasoning for your decision.
-4. Output **only** the complete JSON object. Do not include any other text, explanations, or markdown code block fences (like ```json).
-
-**Output Format:**
-Your output must be a valid JSON object with the following keys:
-{
-"reasoning": "Step-by-step explanation of your analysis.",
-"exist": "<yes or no>",
-}
-
-** The title: **
-{{ structure }} {{ title }}
-
-** Given text: **
-{{ text }}
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -21,18 +21,14 @@ import sys
 import threading
 import time

-import json_repair
-
-from api.db.services.canvas_service import UserCanvasService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.pipeline_operation_log_service import PipelineOperationLogService
+from api.utils import get_uuid
 from api.utils.api_utils import timeout
-from api.utils.base64_image import image2id
 from api.utils.log_utils import init_root_logger, get_project_base_directory
-from graphrag.general.index import run_graphrag_for_kb
+from graphrag.general.index import run_graphrag
 from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
 from rag.flow.pipeline import Pipeline
-from rag.prompts import keyword_extraction, question_proposal, content_tagging
+from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging
+
 import logging
 import os
 from datetime import datetime
@ -41,6 +37,7 @@ import xxhash
 import copy
 import re
 from functools import partial
+from io import BytesIO
 from multiprocessing.context import TimeoutError
 from timeit import default_timer as timer
 import tracemalloc
@ -48,19 +45,21 @@ import signal
 import trio
 import exceptiongroup
 import faulthandler
+
 import numpy as np
 from peewee import DoesNotExist
-from api.db import LLMType, ParserType, PipelineTaskType
+
+from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
-from api.db.services.task_service import TaskService, has_canceled, CANVAS_DEBUG_DOC_ID, GRAPH_RAPTOR_FAKE_DOC_ID
+from api.db.services.task_service import TaskService, has_canceled
 from api.db.services.file2document_service import File2DocumentService
 from api import settings
 from api.versions import get_ragflow_version
 from api.db.db_models import close_connection
 from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio, \
    email, tag
-from rag.nlp import search, rag_tokenizer, add_positions
+from rag.nlp import search, rag_tokenizer
 from rag.raptor import RecursiveAbstractiveProcessing4TreeOrganizedRetrieval as Raptor
 from rag.settings import DOC_MAXIMUM_SIZE, DOC_BULK_SIZE, EMBEDDING_BATCH_SIZE, SVR_CONSUMER_GROUP_NAME, get_svr_queue_name, get_svr_queue_names, print_rag_settings, TAG_FLD, PAGERANK_FLD
 from rag.utils import num_tokens_from_string, truncate
@ -89,13 +88,6 @@ FACTORY = {
    ParserType.TAG.value: tag
 }

-TASK_TYPE_TO_PIPELINE_TASK_TYPE = {
-    "dataflow" : PipelineTaskType.PARSE,
-    "raptor": PipelineTaskType.RAPTOR,
-    "graphrag": PipelineTaskType.GRAPH_RAG,
-    "mindmap": PipelineTaskType.MINDMAP,
-}
-
 UNACKED_ITERATOR = None

 CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1]
@ -151,7 +143,6 @@ def start_tracemalloc_and_snapshot(signum, frame):
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    logging.info(f"taken snapshot {snapshot_file}. max RSS={max_rss / 1000:.2f} MB, current memory usage: {current / 10**6:.2f} MB, Peak memory usage: {peak / 10**6:.2f} MB")

-
 # SIGUSR2 handler: stop tracemalloc
 def stop_tracemalloc(signum, frame):
    if tracemalloc.is_tracing():
@ -160,7 +151,6 @@ def stop_tracemalloc(signum, frame):
    else:
        logging.info("tracemalloc not running")

-
 class TaskCanceledException(Exception):
    def __init__(self, msg):
        self.msg = msg
@ -226,14 +216,7 @@ async def collect():
        return None, None

    canceled = False
-    if msg.get("doc_id", "") in [GRAPH_RAPTOR_FAKE_DOC_ID, CANVAS_DEBUG_DOC_ID]:
-        task = msg
-        if task["task_type"] in ["graphrag", "raptor", "mindmap"] and msg.get("doc_ids", []):
-            task = TaskService.get_task(msg["id"], msg["doc_ids"])
-            task["doc_ids"] = msg["doc_ids"]
-    else:
-        task = TaskService.get_task(msg["id"])
-
+    task = TaskService.get_task(msg["id"])
    if task:
        canceled = has_canceled(task["id"])
    if not task or canceled:
@ -245,9 +228,10 @@ async def collect():

    task_type = msg.get("task_type", "")
    task["task_type"] = task_type
-    if task_type[:8] == "dataflow":
-        task["tenant_id"] = msg["tenant_id"]
-        task["dataflow_id"] = msg["dataflow_id"]
+    if task_type == "dataflow":
+        task["tenant_id"]=msg.get("tenant_id", "")
+        task["dsl"] = msg.get("dsl", "")
+        task["dataflow_id"] = msg.get("dataflow_id", get_uuid())
        task["kb_id"] = msg.get("kb_id", "")
    return redis_msg, task

@ -317,8 +301,30 @@ async def build_chunks(task, progress_callback):
                d["img_id"] = ""
                docs.append(d)
                return
-            await image2id(d, partial(STORAGE_IMPL.put), d["id"], task["kb_id"])
-            docs.append(d)
+
+            with BytesIO() as output_buffer:
+                if isinstance(d["image"], bytes):
+                    output_buffer.write(d["image"])
+                    output_buffer.seek(0)
+                else:
+                    # If the image is in RGBA mode, convert it to RGB mode before saving it in JPEG format.
+                    if d["image"].mode in ("RGBA", "P"):
+                        converted_image = d["image"].convert("RGB")
+                        #d["image"].close()  # Close original image
+                        d["image"] = converted_image
+                    try:
+                        d["image"].save(output_buffer, format='JPEG')
+                    except OSError as e:
+                        logging.warning(
+                            "Saving image of chunk {}/{}/{} got exception, ignore: {}".format(task["location"], task["name"], d["id"], str(e)))
+
+                async with minio_limiter:
+                    await trio.to_thread.run_sync(lambda: STORAGE_IMPL.put(task["kb_id"], d["id"], output_buffer.getvalue()))
+                d["img_id"] = "{}-{}".format(task["kb_id"], d["id"])
+                if not isinstance(d["image"], bytes):
+                    d["image"].close()
+                del d["image"]  # Remove image reference
+                docs.append(d)
        except Exception:
            logging.exception(
                "Saving image of chunk {}/{}/{} got exception".format(task["location"], task["name"], d["id"]))
@ -476,192 +482,35 @@ async def embedding(docs, mdl, parser_config=None, callback=None):
    return tk_count, vector_size


-async def run_dataflow(task: dict):
-    task_start_ts = timer()
-    dataflow_id = task["dataflow_id"]
-    doc_id = task["doc_id"]
-    task_id = task["id"]
-    task_dataset_id = task["kb_id"]
+async def run_dataflow(dsl:str, tenant_id:str, doc_id:str, task_id:str, flow_id:str, callback=None):
+    _ = callback

-    if task["task_type"] == "dataflow":
-        e, cvs = UserCanvasService.get_by_id(dataflow_id)
-        assert e, "User pipeline not found."
-        dsl = cvs.dsl
-    else:
-        e, pipeline_log = PipelineOperationLogService.get_by_id(dataflow_id)
-        assert e, "Pipeline log not found."
-        dsl = pipeline_log.dsl
-        dataflow_id = pipeline_log.pipeline_id
-    pipeline = Pipeline(dsl, tenant_id=task["tenant_id"], doc_id=doc_id, task_id=task_id, flow_id=dataflow_id)
-    chunks = await pipeline.run(file=task["file"]) if task.get("file") else await pipeline.run()
-    if doc_id == CANVAS_DEBUG_DOC_ID:
-        return
+    pipeline = Pipeline(dsl=dsl, tenant_id=tenant_id, doc_id=doc_id, task_id=task_id, flow_id=flow_id)
+    pipeline.reset()

-    if not chunks:
-        PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-        return
-
-    embedding_token_consumption = chunks.get("embedding_token_consumption", 0)
-    if chunks.get("chunks"):
-        chunks = copy.deepcopy(chunks["chunks"])
-    elif chunks.get("json"):
-        chunks = copy.deepcopy(chunks["json"])
-    elif chunks.get("markdown"):
-        chunks = [{"text": [chunks["markdown"]]}]
-    elif chunks.get("text"):
-        chunks = [{"text": [chunks["text"]]}]
-    elif chunks.get("html"):
-        chunks = [{"text": [chunks["html"]]}]
-
-    keys = [k for o in chunks for k in list(o.keys())]
-    if not any([re.match(r"q_[0-9]+_vec", k) for k in keys]):
-        try:
-            set_progress(task_id, prog=0.82, msg="\n-------------------------------------\nStart to embedding...")
-            e, kb = KnowledgebaseService.get_by_id(task["kb_id"])
-            embedding_id = kb.embd_id
-            embedding_model = LLMBundle(task["tenant_id"], LLMType.EMBEDDING, llm_name=embedding_id)
-            @timeout(60)
-            def batch_encode(txts):
-                nonlocal embedding_model
-                return embedding_model.encode([truncate(c, embedding_model.max_length - 10) for c in txts])
-            vects = np.array([])
-            texts = [o.get("questions", o.get("summary", o["text"])) for o in chunks]
-            delta = 0.20/(len(texts)//EMBEDDING_BATCH_SIZE+1)
-            prog = 0.8
-            for i in range(0, len(texts), EMBEDDING_BATCH_SIZE):
-                async with embed_limiter:
-                    vts, c = await trio.to_thread.run_sync(lambda: batch_encode(texts[i : i + EMBEDDING_BATCH_SIZE]))
-                if len(vects) == 0:
-                    vects = vts
-                else:
-                    vects = np.concatenate((vects, vts), axis=0)
-                embedding_token_consumption += c
-                prog += delta
-                if i % (len(texts)//EMBEDDING_BATCH_SIZE/100+1) == 1:
-                    set_progress(task_id, prog=prog, msg=f"{i+1} / {len(texts)//EMBEDDING_BATCH_SIZE}")
-
-            assert len(vects) == len(chunks)
-            for i, ck in enumerate(chunks):
-                v = vects[i].tolist()
-                ck["q_%d_vec" % len(v)] = v
-        except Exception as e:
-            set_progress(task_id, prog=-1, msg=f"[ERROR]: {e}")
-            PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-            return
-
-
-    metadata = {}
-    def dict_update(meta):
-        nonlocal metadata
-        if not meta:
-            return
-        if isinstance(meta, str):
-            try:
-                meta = json_repair.loads(meta)
-            except Exception:
-                logging.error("Meta data format error.")
-                return
-        if not isinstance(meta, dict):
-            return
-        for k, v in meta.items():
-            if isinstance(v, list):
-                v = [vv for vv in v if isinstance(vv, str)]
-                if not v:
-                    continue
-            if not isinstance(v, list) and not isinstance(v, str):
-                continue
-            if k not in metadata:
-                metadata[k] = v
-                continue
-            if isinstance(metadata[k], list):
-                if isinstance(v, list):
-                    metadata[k].extend(v)
-                else:
-                    metadata[k].append(v)
-            else:
-                metadata[k] = v
-
-    for ck in chunks:
-        ck["doc_id"] = doc_id
-        ck["kb_id"] = [str(task["kb_id"])]
-        ck["docnm_kwd"] = task["name"]
-        ck["create_time"] = str(datetime.now()).replace("T", " ")[:19]
-        ck["create_timestamp_flt"] = datetime.now().timestamp()
-        ck["id"] = xxhash.xxh64((ck["text"] + str(ck["doc_id"])).encode("utf-8")).hexdigest()
-        if "questions" in ck:
-            if "question_tks" not in ck:
-                ck["question_kwd"] = ck["questions"].split("\n")
-                ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
-            del ck["questions"]
-        if "keywords" in ck:
-            if "important_tks" not in ck:
-                ck["important_kwd"] = ck["keywords"].split(",")
-                ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
-            del ck["keywords"]
-        if "summary" in ck:
-            if "content_ltks" not in ck:
-                ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
-                ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
-            del ck["summary"]
-        if "metadata" in ck:
-            dict_update(ck["metadata"])
-            del ck["metadata"]
-        if "content_with_weight" not in ck:
-            ck["content_with_weight"] = ck["text"]
-        del ck["text"]
-        if "positions" in ck:
-            add_positions(ck, ck["positions"])
-            del ck["positions"]
-
-    if metadata:
-        e, doc = DocumentService.get_by_id(doc_id)
-        if e:
-            if isinstance(doc.meta_fields, str):
-                doc.meta_fields = json.loads(doc.meta_fields)
-            dict_update(doc.meta_fields)
-            DocumentService.update_by_id(doc_id, {"meta_fields": metadata})
-
-    start_ts = timer()
-    set_progress(task_id, prog=0.82, msg="[DOC Engine]:\nStart to index...")
-    e = await insert_es(task_id, task["tenant_id"], task["kb_id"], chunks, partial(set_progress, task_id, 0, 100000000))
-    if not e:
-        PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
-        return
-
-    time_cost = timer() - start_ts
-    task_time_cost = timer() - task_start_ts
-    set_progress(task_id, prog=1., msg="Indexing done ({:.2f}s). Task done ({:.2f}s)".format(time_cost, task_time_cost))
-    DocumentService.increment_chunk_num(doc_id, task_dataset_id, embedding_token_consumption, len(chunks), task_time_cost)
-    logging.info("[Done], chunks({}), token({}), elapsed:{:.2f}".format(len(chunks),  embedding_token_consumption, task_time_cost))
-    PipelineOperationLogService.create(document_id=doc_id, pipeline_id=dataflow_id, task_type=PipelineTaskType.PARSE, dsl=str(pipeline))
+    await pipeline.run()


@timeout(3600)
-async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_size, callback=None, doc_ids=[]):
-    fake_doc_id = GRAPH_RAPTOR_FAKE_DOC_ID
-
-    raptor_config = kb_parser_config.get("raptor", {})
-
+async def run_raptor(row, chat_mdl, embd_mdl, vector_size, callback=None):
    chunks = []
    vctr_nm = "q_%d_vec"%vector_size
-    for doc_id in doc_ids:
-        for d in settings.retrievaler.chunk_list(doc_id, row["tenant_id"], [str(row["kb_id"])],
-                                                 fields=["content_with_weight", vctr_nm],
-                                                 sort_by_position=True):
-            chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))
+    for d in settings.retrievaler.chunk_list(row["doc_id"], row["tenant_id"], [str(row["kb_id"])],
+                                             fields=["content_with_weight", vctr_nm]):
+        chunks.append((d["content_with_weight"], np.array(d[vctr_nm])))

    raptor = Raptor(
-        raptor_config.get("max_cluster", 64),
+        row["parser_config"]["raptor"].get("max_cluster", 64),
        chat_mdl,
        embd_mdl,
-        raptor_config["prompt"],
-        raptor_config["max_token"],
-        raptor_config["threshold"],
+        row["parser_config"]["raptor"]["prompt"],
+        row["parser_config"]["raptor"]["max_token"],
+        row["parser_config"]["raptor"]["threshold"]
    )
    original_length = len(chunks)
-    chunks = await raptor(chunks, row["kb_parser_config"]["raptor"]["random_seed"], callback)
+    chunks = await raptor(chunks, row["parser_config"]["raptor"]["random_seed"], callback)
    doc = {
-        "doc_id": fake_doc_id,
+        "doc_id": row["doc_id"],
        "kb_id": [str(row["kb_id"])],
        "docnm_kwd": row["name"],
        "title_tks": rag_tokenizer.tokenize(row["name"])
@ -672,7 +521,7 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
    tk_count = 0
    for content, vctr in chunks[original_length:]:
        d = copy.deepcopy(doc)
-        d["id"] = xxhash.xxh64((content + str(fake_doc_id)).encode("utf-8")).hexdigest()
+        d["id"] = xxhash.xxh64((content + str(d["doc_id"])).encode("utf-8")).hexdigest()
        d["create_time"] = str(datetime.now()).replace("T", " ")[:19]
        d["create_timestamp_flt"] = datetime.now().timestamp()
        d[vctr_nm] = vctr.tolist()
@ -684,51 +533,8 @@ async def run_raptor_for_kb(row, kb_parser_config, chat_mdl, embd_mdl, vector_si
    return res, tk_count


-async def delete_image(kb_id, chunk_id):
-    try:
-        async with minio_limiter:
-            STORAGE_IMPL.delete(kb_id, chunk_id)
-    except Exception:
-        logging.exception(f"Deleting image of chunk {chunk_id} got exception")
-        raise
-
-
-async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
-    for b in range(0, len(chunks), DOC_BULK_SIZE):
-        doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
-        task_canceled = has_canceled(task_id)
-        if task_canceled:
-            progress_callback(-1, msg="Task has been canceled.")
-            return
-        if b % 128 == 0:
-            progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
-        if doc_store_result:
-            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
-            progress_callback(-1, msg=error_message)
-            raise Exception(error_message)
-        chunk_ids = [chunk["id"] for chunk in chunks[:b + DOC_BULK_SIZE]]
-        chunk_ids_str = " ".join(chunk_ids)
-        try:
-            TaskService.update_chunk_ids(task_id, chunk_ids_str)
-        except DoesNotExist:
-            logging.warning(f"do_handle_task update_chunk_ids failed since task {task_id} is unknown.")
-            doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id))
-            async with trio.open_nursery() as nursery:
-                for chunk_id in chunk_ids:
-                    nursery.start_soon(delete_image, task_dataset_id, chunk_id)
-            progress_callback(-1, msg=f"Chunk updates failed since task {task_id} is unknown.")
-            return
-    return True
-
-
@timeout(60*60*2, 1)
 async def do_handle_task(task):
-    task_type = task.get("task_type", "")
-
-    if task_type == "dataflow" and task.get("doc_id", "") == CANVAS_DEBUG_DOC_ID:
-        await run_dataflow(task)
-        return
-
    task_id = task["id"]
    task_from_page = task["from_page"]
    task_to_page = task["to_page"]
@ -770,70 +576,32 @@ async def do_handle_task(task):

    init_kb(task, vector_size)

-    if task_type[:len("dataflow")] == "dataflow":
-        await run_dataflow(task)
+    task_type = task.get("task_type", "")
+    if task_type == "dataflow":
+        task_dataflow_dsl = task["dsl"]
+        task_dataflow_id = task["dataflow_id"]
+        await run_dataflow(dsl=task_dataflow_dsl, tenant_id=task_tenant_id, doc_id=task_doc_id, task_id=task_id, flow_id=task_dataflow_id, callback=None)
        return
-
-    if task_type == "raptor":
-        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
-        if not ok:
-            progress_callback(prog=-1.0, msg="Cannot found valid knowledgebase for RAPTOR task")
-            return
-
-        kb_parser_config = kb.parser_config
-        if not kb_parser_config.get("raptor", {}).get("use_raptor", False):
-            progress_callback(prog=-1.0, msg="Internal error: Invalid RAPTOR configuration")
-            return
+    elif task_type == "raptor":
        # bind LLM for raptor
        chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
        # run RAPTOR
        async with kg_limiter:
-            chunks, token_count = await run_raptor_for_kb(
-                row=task,
-                kb_parser_config=kb_parser_config,
-                chat_mdl=chat_model,
-                embd_mdl=embedding_model,
-                vector_size=vector_size,
-                callback=progress_callback,
-                doc_ids=task.get("doc_ids", []),
-            )
+            chunks, token_count = await run_raptor(task, chat_model, embedding_model, vector_size, progress_callback)
    # Either using graphrag or Standard chunking methods
    elif task_type == "graphrag":
-        ok, kb = KnowledgebaseService.get_by_id(task_dataset_id)
-        if not ok:
-            progress_callback(prog=-1.0, msg="Cannot found valid knowledgebase for GraphRAG task")
+        if not task_parser_config.get("graphrag", {}).get("use_graphrag", False):
+            progress_callback(prog=-1.0, msg="Internal configuration error.")
            return
-
-        kb_parser_config = kb.parser_config
-        if not kb_parser_config.get("graphrag", {}).get("use_graphrag", False):
-            progress_callback(prog=-1.0, msg="Internal error: Invalid GraphRAG configuration")
-            return
-
-        graphrag_conf = kb_parser_config.get("graphrag", {})
+        graphrag_conf = task["kb_parser_config"].get("graphrag", {})
        start_ts = timer()
        chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
        with_resolution = graphrag_conf.get("resolution", False)
        with_community = graphrag_conf.get("community", False)
        async with kg_limiter:
-            # await run_graphrag(task, task_language, with_resolution, with_community, chat_model, embedding_model, progress_callback)
-            result = await run_graphrag_for_kb(
-                row=task,
-                doc_ids=task.get("doc_ids", []),
-                language=task_language,
-                kb_parser_config=kb_parser_config,
-                chat_model=chat_model,
-                embedding_model=embedding_model,
-                callback=progress_callback,
-                with_resolution=with_resolution,
-                with_community=with_community,
-            )
-            logging.info(f"GraphRAG task result for task {task}:\n{result}")
+            await run_graphrag(task, task_language, with_resolution, with_community, chat_model, embedding_model, progress_callback)
        progress_callback(prog=1.0, msg="Knowledge Graph done ({:.2f}s)".format(timer() - start_ts))
        return
-    elif task_type == "mindmap":
-        progress_callback(1, "place holder")
-        pass
-        return
    else:
        # Standard chunking methods
        start_ts = timer()
@ -860,9 +628,41 @@ async def do_handle_task(task):

    chunk_count = len(set([chunk["id"] for chunk in chunks]))
    start_ts = timer()
-    e = await insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback)
-    if not e:
-        return
+    doc_store_result = ""
+
+    async def delete_image(kb_id, chunk_id):
+        try:
+            async with minio_limiter:
+                STORAGE_IMPL.delete(kb_id, chunk_id)
+        except Exception:
+            logging.exception(
+                "Deleting image of chunk {}/{}/{} got exception".format(task["location"], task["name"], chunk_id))
+            raise
+
+    for b in range(0, len(chunks), DOC_BULK_SIZE):
+        doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
+        task_canceled = has_canceled(task_id)
+        if task_canceled:
+            progress_callback(-1, msg="Task has been canceled.")
+            return
+        if b % 128 == 0:
+            progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
+        if doc_store_result:
+            error_message = f"Insert chunk error: {doc_store_result}, please check log file and Elasticsearch/Infinity status!"
+            progress_callback(-1, msg=error_message)
+            raise Exception(error_message)
+        chunk_ids = [chunk["id"] for chunk in chunks[:b + DOC_BULK_SIZE]]
+        chunk_ids_str = " ".join(chunk_ids)
+        try:
+            TaskService.update_chunk_ids(task["id"], chunk_ids_str)
+        except DoesNotExist:
+            logging.warning(f"do_handle_task update_chunk_ids failed since task {task['id']} is unknown.")
+            doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(task_tenant_id), task_dataset_id))
+            async with trio.open_nursery() as nursery:
+                for chunk_id in chunk_ids:
+                    nursery.start_soon(delete_image, task_dataset_id, chunk_id)
+            progress_callback(-1, msg=f"Chunk updates failed since task {task['id']} is unknown.")
+            return

    logging.info("Indexing doc({}), page({}-{}), chunks({}), elapsed: {:.2f}".format(task_document_name, task_from_page,
                                                                                     task_to_page, len(chunks),
@ -885,10 +685,6 @@ async def handle_task():
    if not task:
        await trio.sleep(5)
        return
-
-    task_type = task["task_type"]
-    pipeline_task_type = TASK_TYPE_TO_PIPELINE_TASK_TYPE.get(task_type, PipelineTaskType.PARSE) or PipelineTaskType.PARSE
-
    try:
        logging.info(f"handle_task begin for task {json.dumps(task)}")
        CURRENT_TASKS[task["id"]] = copy.deepcopy(task)
@ -908,13 +704,6 @@ async def handle_task():
        except Exception:
            pass
        logging.exception(f"handle_task got exception for task {json.dumps(task)}")
-    finally:
-        task_document_ids = []
-        if task_type in ["graphrag", "raptor", "mindmap"]:
-            task_document_ids = task["doc_ids"]
-        if not task.get("dataflow_id", ""):
-            PipelineOperationLogService.record_pipeline_operation(document_id=task["doc_id"], pipeline_id="", task_type=pipeline_task_type, fake_document_ids=task_document_ids)
-
    redis_msg.ack()


--- a/rag/utils/minio_conn.py
+++ b/rag/utils/minio_conn.py
@ -108,19 +108,6 @@ class RAGFlowMinio:
            logging.exception(f"obj_exist {bucket}/{filename} got exception")
            return False

-    def bucket_exists(self, bucket):
-        try:
-            if not self.conn.bucket_exists(bucket):
-                return False
-            else:
-                return True
-        except S3Error as e:
-            if e.code in ["NoSuchKey", "NoSuchBucket", "ResourceNotFound"]:
-                return False
-        except Exception:
-            logging.exception(f"bucket_exist {bucket} got exception")
-            return False
-
    def get_presigned_url(self, bucket, fnm, expires):
        for _ in range(10):
            try:
--- a/uv.lock
+++ b/uv.lock
@ -1,5 +1,4 @@
 version = 1
-revision = 3
 requires-python = ">=3.10, <3.13"
 resolution-markers = [
    "python_full_version >= '3.12' and sys_platform == 'darwin'",
@ -5564,7 +5563,6 @@ requires-dist = [
    { name = "yfinance", specifier = "==0.2.65" },
    { name = "zhipuai", specifier = "==2.0.1" },
 ]
-provides-extras = ["full"]

 [package.metadata.requires-dev]
 test = [
--- a/web/public/iconfont.js
+++ b/web/public/iconfont.js
--- a/web/src/assets/svg/data-flow/data-icon-bri.svg
+++ b/web/src/assets/svg/data-flow/data-icon-bri.svg
@ -1,15 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path d="M35.3194 10.6367H20.4258C19.4857 10.6367 18.7236 11.3988 18.7236 12.3388V34.892C18.7236 35.8321 19.4857 36.5942 20.4258 36.5942H35.3194C36.2594 36.5942 37.0215 35.8321 37.0215 34.892V12.3388C37.0215 11.3988 36.2594 10.6367 35.3194 10.6367Z" fill="url(#paint0_linear_488_37636)"/>
-<path d="M31.0639 4.25391H5.10642C4.16637 4.25391 3.4043 5.01597 3.4043 5.95603V18.2965C3.4043 19.2365 4.16637 19.9986 5.10642 19.9986H31.0639C32.0039 19.9986 32.766 19.2365 32.766 18.2965V5.95603C32.766 5.01597 32.0039 4.25391 31.0639 4.25391Z" fill="#00BEB4" fill-opacity="0.1"/>
-<path d="M31.0639 4.25391C32.0039 4.25391 32.766 5.01597 32.766 5.95603V18.2965C32.766 19.2365 32.0039 19.9986 31.0639 19.9986H5.10642C4.16637 19.9986 3.4043 19.2365 3.4043 18.2965V5.95603C3.4043 5.01597 4.16637 4.25391 5.10642 4.25391H31.0639ZM31.0639 4.67944H5.10642C4.40138 4.67944 3.82983 5.25099 3.82983 5.95603V18.2965C3.82983 19.0015 4.40138 19.5731 5.10642 19.5731H31.0639C31.7689 19.5731 32.3405 19.0015 32.3405 18.2965V5.95603C32.3405 5.25099 31.7689 4.67944 31.0639 4.67944Z" fill="#00BEB4"/>
-<path d="M31.0639 22.5547H5.10642C4.16637 22.5547 3.4043 23.3168 3.4043 24.2568V34.8951C3.4043 35.8352 4.16637 36.5972 5.10642 36.5972H31.0639C32.0039 36.5972 32.766 35.8352 32.766 34.8951V24.2568C32.766 23.3168 32.0039 22.5547 31.0639 22.5547Z" fill="#00BEB4" fill-opacity="0.1"/>
-<path d="M31.0639 22.5547C32.0039 22.5547 32.766 23.3168 32.766 24.2568V34.8951C32.766 35.8352 32.0039 36.5972 31.0639 36.5972H5.10642C4.16637 36.5972 3.4043 35.8352 3.4043 34.8951V24.2568C3.4043 23.3168 4.16637 22.5547 5.10642 22.5547H31.0639ZM31.0639 22.9802H5.10642C4.40138 22.9802 3.82983 23.5518 3.82983 24.2568V34.8951C3.82983 35.6002 4.40138 36.1717 5.10642 36.1717H31.0639C31.7689 36.1717 32.3405 35.6002 32.3405 34.8951V24.2568C32.3405 23.5518 31.7689 22.9802 31.0639 22.9802Z" fill="#00BEB4"/>
-<path d="M10.6384 14.8949C12.2835 14.8949 13.6171 13.5613 13.6171 11.9162C13.6171 10.2711 12.2835 8.9375 10.6384 8.9375C8.99329 8.9375 7.65967 10.2711 7.65967 11.9162C7.65967 13.5613 8.99329 14.8949 10.6384 14.8949Z" fill="#00BEB4"/>
-<path d="M10.6384 32.766C12.2835 32.766 13.6171 31.4324 13.6171 29.7873C13.6171 28.1422 12.2835 26.8086 10.6384 26.8086C8.99329 26.8086 7.65967 28.1422 7.65967 29.7873C7.65967 31.4324 8.99329 32.766 10.6384 32.766Z" fill="#00BEB4"/>
-<defs>
-<linearGradient id="paint0_linear_488_37636" x1="933.617" y1="10.6367" x2="933.617" y2="2606.38" gradientUnits="userSpaceOnUse">
-<stop stop-color="#C9F1EF"/>
-<stop offset="1" stop-color="#00BEB4"/>
-</linearGradient>
-</defs>
-</svg>
--- a/web/src/assets/svg/data-flow/data-icon.svg
+++ b/web/src/assets/svg/data-flow/data-icon.svg
@ -1,15 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path d="M35.3194 10.6387H20.4258C19.4857 10.6387 18.7236 11.4007 18.7236 12.3408V34.894C18.7236 35.834 19.4857 36.5961 20.4258 36.5961H35.3194C36.2594 36.5961 37.0215 35.834 37.0215 34.894V12.3408C37.0215 11.4007 36.2594 10.6387 35.3194 10.6387Z" fill="url(#paint0_linear_491_41413)"/>
-<path d="M31.0639 4.25586H5.10642C4.16637 4.25586 3.4043 5.01793 3.4043 5.95799V18.2984C3.4043 19.2385 4.16637 20.0005 5.10642 20.0005H31.0639C32.0039 20.0005 32.766 19.2385 32.766 18.2984V5.95799C32.766 5.01793 32.0039 4.25586 31.0639 4.25586Z" fill="#00BEB4" fill-opacity="0.2"/>
-<path d="M31.0639 4.25586C32.0039 4.25586 32.766 5.01793 32.766 5.95799V18.2984C32.766 19.2385 32.0039 20.0005 31.0639 20.0005H5.10642C4.16637 20.0005 3.4043 19.2385 3.4043 18.2984V5.95799C3.4043 5.01793 4.16637 4.25586 5.10642 4.25586H31.0639ZM31.0639 4.68139H5.10642C4.40138 4.68139 3.82983 5.25294 3.82983 5.95799V18.2984C3.82983 19.0035 4.40138 19.575 5.10642 19.575H31.0639C31.7689 19.575 32.3405 19.0035 32.3405 18.2984V5.95799C32.3405 5.25294 31.7689 4.68139 31.0639 4.68139Z" fill="#226365"/>
-<path d="M31.0639 22.5527H5.10642C4.16637 22.5527 3.4043 23.3148 3.4043 24.2549V34.8932C3.4043 35.8332 4.16637 36.5953 5.10642 36.5953H31.0639C32.0039 36.5953 32.766 35.8332 32.766 34.8932V24.2549C32.766 23.3148 32.0039 22.5527 31.0639 22.5527Z" fill="#3A9093" fill-opacity="0.2"/>
-<path d="M31.0639 22.5527C32.0039 22.5527 32.766 23.3148 32.766 24.2549V34.8932C32.766 35.8332 32.0039 36.5953 31.0639 36.5953H5.10642C4.16637 36.5953 3.4043 35.8332 3.4043 34.8932V24.2549C3.4043 23.3148 4.16637 22.5527 5.10642 22.5527H31.0639ZM31.0639 22.9783H5.10642C4.40138 22.9783 3.82983 23.5498 3.82983 24.2549V34.8932C3.82983 35.5982 4.40138 36.1698 5.10642 36.1698H31.0639C31.7689 36.1698 32.3405 35.5982 32.3405 34.8932V24.2549C32.3405 23.5498 31.7689 22.9783 31.0639 22.9783Z" fill="#226365"/>
-<path d="M10.6384 14.893C12.2835 14.893 13.6171 13.5594 13.6171 11.9143C13.6171 10.2692 12.2835 8.93555 10.6384 8.93555C8.99329 8.93555 7.65967 10.2692 7.65967 11.9143C7.65967 13.5594 8.99329 14.893 10.6384 14.893Z" fill="#3A9093"/>
-<path d="M10.6384 32.766C12.2835 32.766 13.6171 31.4324 13.6171 29.7873C13.6171 28.1422 12.2835 26.8086 10.6384 26.8086C8.99329 26.8086 7.65967 28.1422 7.65967 29.7873C7.65967 31.4324 8.99329 32.766 10.6384 32.766Z" fill="#3A9093"/>
-<defs>
-<linearGradient id="paint0_linear_491_41413" x1="933.617" y1="10.6387" x2="933.617" y2="2606.38" gradientUnits="userSpaceOnUse">
-<stop stop-color="#1B3C3D"/>
-<stop offset="1" stop-color="#164142"/>
-</linearGradient>
-</defs>
-</svg>
--- a/web/src/assets/svg/data-flow/knowledgegraph.svg
+++ b/web/src/assets/svg/data-flow/knowledgegraph.svg
@ -0,0 +1 @@
+<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1756884949583" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="11332" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><path d="M190.464 489.472h327.68v40.96h-327.68z" fill="#C7DCFE" p-id="11333"></path><path d="M482.34496 516.5056l111.26784-308.20352 38.54336 13.9264L520.86784 530.432z" fill="#C7DCFE" p-id="11334"></path><path d="M620.544 196.608m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#8FB8FC" p-id="11335"></path><path d="M182.272 509.952m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#C7DCFE" p-id="11336"></path><path d="M558.65344 520.9088l283.77088 163.84-20.48 35.47136-283.77088-163.84z" fill="#C7DCFE" p-id="11337"></path><path d="M841.728 686.08m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#B3CEFE" p-id="11338"></path><path d="M448.67584 803.77856l49.60256-323.91168 40.48896 6.20544-49.60256 323.91168z" fill="#C7DCFE" p-id="11339"></path><path d="M512 530.432m-143.36 0a143.36 143.36 0 1 0 286.72 0 143.36 143.36 0 1 0-286.72 0Z" fill="#4185FF" p-id="11340"></path><path d="M462.848 843.776m-102.4 0a102.4 102.4 0 1 0 204.8 0 102.4 102.4 0 1 0-204.8 0Z" fill="#8FB8FC" p-id="11341"></path></svg>
--- a/web/src/assets/svg/data-flow/processing-icon-bri.svg
+++ b/web/src/assets/svg/data-flow/processing-icon-bri.svg
@ -1,6 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path fill-rule="evenodd" clip-rule="evenodd" d="M21.8074 21.9283L30.4051 33.9033C30.9531 34.667 30.7785 35.7307 30.0148 36.2787C29.7258 36.4865 29.3785 36.5982 29.0223 36.5982H11.8273C10.8871 36.5982 10.125 35.8361 10.125 34.8963C10.125 34.54 10.2367 34.1928 10.4445 33.9033L19.0422 21.9283C19.5902 21.1646 20.6539 20.99 21.4176 21.5385C21.5676 21.6463 21.6996 21.7779 21.8074 21.9283Z" fill="#C6EFED"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M5.94336 3.39844H34.0285C35.9086 3.39844 37.4328 4.92266 37.4328 6.80273V27.2281C37.4328 29.1082 35.9086 30.6324 34.0285 30.6324H5.94336C4.06328 30.6324 2.53906 29.1082 2.53906 27.2281V6.80273C2.53906 4.92266 4.06328 3.39844 5.94336 3.39844Z" fill="#00BEB4" fill-opacity="0.2"/>
-<path d="M34.0422 3.40625C35.9223 3.40625 37.4465 4.93047 37.4465 6.81055V27.2359C37.4465 29.116 35.9223 30.6402 34.0422 30.6402H5.95703C4.07695 30.6402 2.55273 29.116 2.55273 27.2359V6.81055C2.55273 4.93047 4.07695 3.40625 5.95703 3.40625H34.0422ZM34.0422 3.83164H5.95703C4.31211 3.83164 2.97852 5.16523 2.97852 6.81055V27.2359C2.97852 28.8812 4.31211 30.2148 5.95703 30.2148H34.0422C35.6871 30.2148 37.0207 28.8812 37.0207 27.2359V6.81055C37.0207 5.16523 35.6871 3.83164 34.0422 3.83164Z" fill="#00BEB4"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M19.9785 11.6797C20.6836 11.6797 21.2551 12.2512 21.2551 12.9562V21.0414C21.2551 21.7465 20.6836 22.318 19.9785 22.318C19.2734 22.318 18.702 21.7465 18.702 21.0414V12.9562C18.702 12.2512 19.2734 11.6797 19.9785 11.6797ZM11.0422 11.6797C11.7473 11.6797 12.3187 12.2512 12.3187 12.9562V21.0414C12.3187 21.7465 11.7473 22.318 11.0422 22.318C10.3371 22.318 9.76562 21.7465 9.76562 21.0414V12.9562C9.76562 12.2512 10.3371 11.6797 11.0422 11.6797ZM28.9145 11.6797C29.6195 11.6797 30.191 12.2512 30.191 12.9562V21.0414C30.191 21.7465 29.6195 22.318 28.9145 22.318C28.2094 22.318 27.6379 21.7465 27.6379 21.0414V12.9562C27.6379 12.2512 28.2094 11.6797 28.9145 11.6797Z" fill="#00BEB4"/>
-</svg>
--- a/web/src/assets/svg/data-flow/processing-icon.svg
+++ b/web/src/assets/svg/data-flow/processing-icon.svg
@ -1,6 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path fill-rule="evenodd" clip-rule="evenodd" d="M21.8074 21.9264L30.4051 33.9014C30.9531 34.665 30.7785 35.7287 30.0148 36.2767C29.7258 36.4846 29.3785 36.5963 29.0223 36.5963H11.8273C10.8871 36.5963 10.125 35.8342 10.125 34.8943C10.125 34.5381 10.2367 34.1908 10.4445 33.9014L19.0422 21.9264C19.5902 21.1627 20.6539 20.9881 21.4176 21.5365C21.5676 21.6443 21.6996 21.776 21.8074 21.9264Z" fill="#1C3C3D"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M5.94336 3.39844H34.0285C35.9086 3.39844 37.4328 4.92266 37.4328 6.80273V27.2281C37.4328 29.1082 35.9086 30.6324 34.0285 30.6324H5.94336C4.06328 30.6324 2.53906 29.1082 2.53906 27.2281V6.80273C2.53906 4.92266 4.06328 3.39844 5.94336 3.39844Z" fill="#00BEB4" fill-opacity="0.2"/>
-<path d="M34.0422 3.4043C35.9223 3.4043 37.4465 4.92852 37.4465 6.80859V27.234C37.4465 29.1141 35.9223 30.6383 34.0422 30.6383H5.95703C4.07695 30.6383 2.55273 29.1141 2.55273 27.234V6.80859C2.55273 4.92852 4.07695 3.4043 5.95703 3.4043H34.0422ZM34.0422 3.82969H5.95703C4.31211 3.82969 2.97852 5.16328 2.97852 6.80859V27.234C2.97852 28.8793 4.31211 30.2129 5.95703 30.2129H34.0422C35.6871 30.2129 37.0207 28.8793 37.0207 27.234V6.80859C37.0207 5.16328 35.6871 3.82969 34.0422 3.82969Z" fill="#1B3B3C"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M19.9785 11.6797C20.6836 11.6797 21.2551 12.2512 21.2551 12.9562V21.0414C21.2551 21.7465 20.6836 22.318 19.9785 22.318C19.2734 22.318 18.702 21.7465 18.702 21.0414V12.9562C18.702 12.2512 19.2734 11.6797 19.9785 11.6797ZM11.0422 11.6797C11.7473 11.6797 12.3187 12.2512 12.3187 12.9562V21.0414C12.3187 21.7465 11.7473 22.318 11.0422 22.318C10.3371 22.318 9.76562 21.7465 9.76562 21.0414V12.9562C9.76562 12.2512 10.3371 11.6797 11.0422 11.6797ZM28.9145 11.6797C29.6195 11.6797 30.191 12.2512 30.191 12.9562V21.0414C30.191 21.7465 29.6195 22.318 28.9145 22.318C28.2094 22.318 27.6379 21.7465 27.6379 21.0414V12.9562C27.6379 12.2512 28.2094 11.6797 28.9145 11.6797Z" fill="#00BEB4"/>
-</svg>
--- a/web/src/assets/svg/data-flow/raptor.svg
+++ b/web/src/assets/svg/data-flow/raptor.svg
--- a/web/src/assets/svg/data-flow/total-files-icon-bri.svg
+++ b/web/src/assets/svg/data-flow/total-files-icon-bri.svg
@ -1,6 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path fill-rule="evenodd" clip-rule="evenodd" d="M11.0291 4.67969C11.8025 4.67969 12.4787 5.20078 12.6752 5.94844L13.3494 8.50937H31.4275C33.1599 8.50937 34.6158 9.81055 34.8103 11.5316L37.0205 31.1062C37.231 32.9746 35.8877 34.6602 34.0193 34.8711C33.8927 34.8852 33.765 34.8926 33.6377 34.8926H6.30289C4.92476 34.8926 3.79547 33.7988 3.75094 32.4215L3.11734 12.7746H3.115L2.90719 6.4375C2.87633 5.49805 3.61304 4.71133 4.5525 4.68047C4.57086 4.68008 4.58961 4.67969 4.60836 4.67969H11.0291Z" fill="#00BEB4" fill-opacity="0.1"/>
-<path d="M11.0291 4.67969C11.8025 4.67969 12.4787 5.20078 12.6752 5.94844L13.349 8.50937H31.4275C33.1599 8.50937 34.6158 9.81055 34.8103 11.5316L37.0205 31.1062C37.231 32.9746 35.8877 34.6602 34.0193 34.8711C33.8927 34.8852 33.765 34.8926 33.6377 34.8926H6.30289C4.92476 34.8926 3.79547 33.7988 3.75094 32.4215L3.11656 12.7742L2.90719 6.4375C2.87633 5.49805 3.61304 4.71133 4.5525 4.68047L4.58023 4.67969H11.0291ZM11.0291 5.10508H4.59078L4.56656 5.10586C3.86187 5.12891 3.30914 5.71914 3.33219 6.42344L3.54195 12.7605L4.17633 32.4078C4.21344 33.5555 5.15445 34.4668 6.30289 34.4668H33.6377C33.749 34.4668 33.8607 34.4605 33.9716 34.448C35.6064 34.2637 36.7822 32.7887 36.5974 31.1539L34.3873 11.5797C34.2173 10.0734 32.9431 8.93516 31.4275 8.93516H13.0209L12.9377 8.61758L12.2638 6.05703C12.1162 5.49609 11.6091 5.10508 11.0291 5.10508Z" fill="#00BEB4"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M9.72812 12.7656H36.6539C38.0637 12.7656 39.207 13.9086 39.207 15.3188C39.207 15.4328 39.1992 15.5465 39.184 15.6594L36.9922 31.943C36.7648 33.6324 35.323 34.8934 33.6184 34.8934H6.37969C4.96953 34.8934 3.82617 33.75 3.82617 32.3398C3.82617 32.2102 3.83633 32.0801 3.85586 31.952L6.36367 15.6523C6.61914 13.9914 8.04805 12.7656 9.72812 12.7656Z" fill="#CAF2F0"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M8.98438 14.6172H20.4848C20.899 14.6172 21.2348 14.9529 21.2348 15.3672C21.2348 15.7814 20.899 16.1172 20.4848 16.1172H8.98438C8.57013 16.1172 8.23438 15.7814 8.23438 15.3672C8.23438 14.9529 8.57013 14.6172 8.98438 14.6172Z" fill="#00BEB4"/>
-</svg>
--- a/web/src/assets/svg/data-flow/total-files-icon.svg
+++ b/web/src/assets/svg/data-flow/total-files-icon.svg
@ -1,6 +0,0 @@
-<svg width="40" height="40" viewBox="0 0 40 40" fill="none" xmlns="http://www.w3.org/2000/svg">
-<path fill-rule="evenodd" clip-rule="evenodd" d="M11.0291 4.68164C11.8025 4.68164 12.4787 5.20273 12.6752 5.95039L13.3494 8.51133H31.4275C33.1599 8.51133 34.6158 9.8125 34.8103 11.5336L37.0205 31.1082C37.231 32.9766 35.8877 34.6621 34.0193 34.873C33.8927 34.8871 33.765 34.8945 33.6377 34.8945H6.30289C4.92476 34.8945 3.79547 33.8008 3.75094 32.4234L3.11734 12.7766H3.115L2.90719 6.43945C2.87633 5.5 3.61304 4.71328 4.5525 4.68242C4.57086 4.68203 4.58961 4.68164 4.60836 4.68164H11.0291Z" fill="#1F3232"/>
-<path d="M11.0291 4.68164C11.8025 4.68164 12.4787 5.20273 12.6752 5.95039L13.349 8.51133H31.4275C33.1599 8.51133 34.6158 9.8125 34.8103 11.5336L37.0205 31.1082C37.231 32.9766 35.8877 34.6621 34.0193 34.873C33.8927 34.8871 33.765 34.8945 33.6377 34.8945H6.30289C4.92476 34.8945 3.79547 33.8008 3.75094 32.4234L3.11656 12.7762L2.90719 6.43945C2.87633 5.5 3.61304 4.71328 4.5525 4.68242L4.58023 4.68164H11.0291ZM11.0291 5.10703H4.59078L4.56656 5.10781C3.86187 5.13086 3.30914 5.72109 3.33219 6.42539L3.54195 12.7625L4.17633 32.4098C4.21344 33.5574 5.15445 34.4687 6.30289 34.4687H33.6377C33.749 34.4687 33.8607 34.4625 33.9716 34.45C35.6064 34.2656 36.7822 32.7906 36.5974 31.1559L34.3873 11.5816C34.2173 10.0754 32.9431 8.93711 31.4275 8.93711H13.0209L12.9377 8.61953L12.2638 6.05898C12.1162 5.49805 11.6091 5.10703 11.0291 5.10703Z" fill="#1B3B3C"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M9.72812 12.7656H36.6539C38.0637 12.7656 39.207 13.9086 39.207 15.3188C39.207 15.4328 39.1992 15.5465 39.184 15.6594L36.9922 31.943C36.7648 33.6324 35.323 34.8934 33.6184 34.8934H6.37969C4.96953 34.8934 3.82617 33.75 3.82617 32.3398C3.82617 32.2102 3.83633 32.0801 3.85586 31.952L6.36367 15.6523C6.61914 13.9914 8.04805 12.7656 9.72812 12.7656Z" fill="#1B3B3C"/>
-<path fill-rule="evenodd" clip-rule="evenodd" d="M8.98438 14.6172H20.4848C20.899 14.6172 21.2348 14.9529 21.2348 15.3672C21.2348 15.7814 20.899 16.1172 20.4848 16.1172H8.98438C8.57013 16.1172 8.23438 15.7814 8.23438 15.3672C8.23438 14.9529 8.57013 14.6172 8.98438 14.6172Z" fill="#00BEB4"/>
-</svg>
--- a/web/src/components/chunk-method-dialog/index.tsx
+++ b/web/src/components/chunk-method-dialog/index.tsx
@ -18,11 +18,8 @@ import { useFetchKnowledgeBaseConfiguration } from '@/hooks/use-knowledge-reques
 import { IModalProps } from '@/interfaces/common';
 import { IParserConfig } from '@/interfaces/database/document';
 import { IChangeParserConfigRequestBody } from '@/interfaces/request/document';
-import {
-  ChunkMethodItem,
-  ParseTypeItem,
-} from '@/pages/dataset/dataset-setting/configuration/common-item';
 import { zodResolver } from '@hookform/resolvers/zod';
+import get from 'lodash/get';
 import omit from 'lodash/omit';
 import {} from 'module';
 import { useEffect, useMemo } from 'react';
@ -33,17 +30,24 @@ import {
  AutoKeywordsFormField,
  AutoQuestionsFormField,
 } from '../auto-keywords-form-field';
-import { DataFlowSelect } from '../data-pipeline-select';
 import { DelimiterFormField } from '../delimiter-form-field';
 import { EntityTypesFormField } from '../entity-types-form-field';
 import { ExcelToHtmlFormField } from '../excel-to-html-form-field';
 import { FormContainer } from '../form-container';
 import { LayoutRecognizeFormField } from '../layout-recognize-form-field';
 import { MaxTokenNumberFormField } from '../max-token-number-from-field';
+import {
+  UseGraphRagFormField,
+  showGraphRagItems,
+} from '../parse-configuration/graph-rag-form-fields';
+import RaptorFormFields, {
+  showRaptorParseConfiguration,
+} from '../parse-configuration/raptor-form-fields';
 import { ButtonLoading } from '../ui/button';
 import { Input } from '../ui/input';
+import { RAGFlowSelect } from '../ui/select';
 import { DynamicPageRange } from './dynamic-page-range';
-import { useShowAutoKeywords } from './hooks';
+import { useFetchParserListOnMount, useShowAutoKeywords } from './hooks';
 import {
  useDefaultParserValues,
  useFillDefaultValueOnMount,
@ -58,7 +62,6 @@ interface IProps
  }> {
  loading: boolean;
  parserId: string;
-  pipelineId?: string;
  parserConfig: IParserConfig;
  documentExtension: string;
  documentId: string;
@ -77,7 +80,6 @@ export function ChunkMethodDialog({
  hideModal,
  onOk,
  parserId,
-  pipelineId,
  documentExtension,
  visible,
  parserConfig,
@ -85,6 +87,8 @@ export function ChunkMethodDialog({
 }: IProps) {
  const { t } = useTranslation();

+  const { parserList } = useFetchParserListOnMount(documentExtension);
+
  const { data: knowledgeDetails } = useFetchKnowledgeBaseConfiguration();

  const useGraphRag = useMemo(() => {
@ -95,59 +99,46 @@ export function ChunkMethodDialog({

  const fillDefaultParserValue = useFillDefaultValueOnMount();

-  const FormSchema = z
-    .object({
-      parseType: z.number(),
-      parser_id: z
-        .string()
-        .min(1, {
-          message: t('common.pleaseSelect'),
+  const FormSchema = z.object({
+    parser_id: z
+      .string()
+      .min(1, {
+        message: t('common.pleaseSelect'),
+      })
+      .trim(),
+    parser_config: z.object({
+      task_page_size: z.coerce.number().optional(),
+      layout_recognize: z.string().optional(),
+      chunk_token_num: z.coerce.number().optional(),
+      delimiter: z.string().optional(),
+      auto_keywords: z.coerce.number().optional(),
+      auto_questions: z.coerce.number().optional(),
+      html4excel: z.boolean().optional(),
+      raptor: z
+        .object({
+          use_raptor: z.boolean().optional(),
+          prompt: z.string().optional().optional(),
+          max_token: z.coerce.number().optional(),
+          threshold: z.coerce.number().optional(),
+          max_cluster: z.coerce.number().optional(),
+          random_seed: z.coerce.number().optional(),
        })
-        .trim(),
-      pipeline_id: z.string().optional(),
-      parser_config: z.object({
-        task_page_size: z.coerce.number().optional(),
-        layout_recognize: z.string().optional(),
-        chunk_token_num: z.coerce.number().optional(),
-        delimiter: z.string().optional(),
-        auto_keywords: z.coerce.number().optional(),
-        auto_questions: z.coerce.number().optional(),
-        html4excel: z.boolean().optional(),
-        // raptor: z
-        //   .object({
-        //     use_raptor: z.boolean().optional(),
-        //     prompt: z.string().optional().optional(),
-        //     max_token: z.coerce.number().optional(),
-        //     threshold: z.coerce.number().optional(),
-        //     max_cluster: z.coerce.number().optional(),
-        //     random_seed: z.coerce.number().optional(),
-        //   })
-        //   .optional(),
-        // graphrag: z.object({
-        //   use_graphrag: z.boolean().optional(),
-        // }),
-        entity_types: z.array(z.string()).optional(),
-        pages: z
-          .array(z.object({ from: z.coerce.number(), to: z.coerce.number() }))
-          .optional(),
+        .optional(),
+      graphrag: z.object({
+        use_graphrag: z.boolean().optional(),
      }),
-    })
-    .superRefine((data, ctx) => {
-      if (data.parseType === 2 && !data.pipeline_id) {
-        ctx.addIssue({
-          path: ['pipeline_id'],
-          message: t('common.pleaseSelect'),
-          code: 'custom',
-        });
-      }
-    });
+      entity_types: z.array(z.string()).optional(),
+      pages: z
+        .array(z.object({ from: z.coerce.number(), to: z.coerce.number() }))
+        .optional(),
+    }),
+  });

  const form = useForm<z.infer<typeof FormSchema>>({
    resolver: zodResolver(FormSchema),
    defaultValues: {
-      parser_id: parserId || '',
-      pipeline_id: pipelineId || '',
-      parseType: pipelineId ? 2 : 1,
+      parser_id: parserId,
+
      parser_config: defaultParserValues,
    },
  });
@ -209,19 +200,17 @@ export function ChunkMethodDialog({
      const pages =
        parserConfig?.pages?.map((x) => ({ from: x[0], to: x[1] })) ?? [];
      form.reset({
-        parser_id: parserId || '',
-        pipeline_id: pipelineId || '',
-        parseType: pipelineId ? 2 : 1,
+        parser_id: parserId,
        parser_config: fillDefaultParserValue({
          pages: pages.length > 0 ? pages : [{ from: 1, to: 1024 }],
          ...omit(parserConfig, 'pages'),
-          // graphrag: {
-          //   use_graphrag: get(
-          //     parserConfig,
-          //     'graphrag.use_graphrag',
-          //     useGraphRag,
-          //   ),
-          // },
+          graphrag: {
+            use_graphrag: get(
+              parserConfig,
+              'graphrag.use_graphrag',
+              useGraphRag,
+            ),
+          },
        }),
      });
    }
@ -231,20 +220,10 @@ export function ChunkMethodDialog({
    knowledgeDetails.parser_config,
    parserConfig,
    parserId,
-    pipelineId,
    useGraphRag,
    visible,
  ]);
-  const parseType = useWatch({
-    control: form.control,
-    name: 'parseType',
-    defaultValue: pipelineId ? 2 : 1,
-  });
-  useEffect(() => {
-    if (parseType === 1) {
-      form.setValue('pipeline_id', '');
-    }
-  }, [parseType, form]);
+
  return (
    <Dialog open onOpenChange={hideModal}>
      <DialogContent className="max-w-[50vw]">
@ -258,17 +237,7 @@ export function ChunkMethodDialog({
            id={FormId}
          >
            <FormContainer>
-              <ParseTypeItem />
-              {parseType === 1 && <ChunkMethodItem></ChunkMethodItem>}
-              {parseType === 2 && (
-                <DataFlowSelect
-                  isMult={false}
-                  // toDataPipeline={navigateToAgents}
-                  formFieldName="pipeline_id"
-                />
-              )}
-
-              {/* <FormField
+              <FormField
                control={form.control}
                name="parser_id"
                render={({ field }) => (
@ -283,11 +252,9 @@ export function ChunkMethodDialog({
                    <FormMessage />
                  </FormItem>
                )}
-              /> */}
-              {showPages && parseType === 1 && (
-                <DynamicPageRange></DynamicPageRange>
-              )}
-              {showPages && parseType === 1 && layoutRecognize && (
+              />
+              {showPages && <DynamicPageRange></DynamicPageRange>}
+              {showPages && layoutRecognize && (
                <FormField
                  control={form.control}
                  name="parser_config.task_page_size"
@ -312,60 +279,50 @@ export function ChunkMethodDialog({
                />
              )}
            </FormContainer>
-            {parseType === 1 && (
-              <>
-                <FormContainer
-                  show={showOne || showMaxTokenNumber}
-                  className="space-y-3"
-                >
-                  {showOne && (
-                    <LayoutRecognizeFormField></LayoutRecognizeFormField>
-                  )}
-                  {showMaxTokenNumber && (
-                    <>
-                      <MaxTokenNumberFormField
-                        max={
-                          selectedTag === DocumentParserType.KnowledgeGraph
-                            ? 8192 * 2
-                            : 2048
-                        }
-                      ></MaxTokenNumberFormField>
-                      <DelimiterFormField></DelimiterFormField>
-                    </>
-                  )}
-                </FormContainer>
-                <FormContainer
-                  show={showAutoKeywords(selectedTag) || showExcelToHtml}
-                  className="space-y-3"
-                >
-                  {showAutoKeywords(selectedTag) && (
-                    <>
-                      <AutoKeywordsFormField></AutoKeywordsFormField>
-                      <AutoQuestionsFormField></AutoQuestionsFormField>
-                    </>
-                  )}
-                  {showExcelToHtml && (
-                    <ExcelToHtmlFormField></ExcelToHtmlFormField>
-                  )}
-                </FormContainer>
-                {/* {showRaptorParseConfiguration(
-                  selectedTag as DocumentParserType,
-                ) && (
-                  <FormContainer>
-                    <RaptorFormFields></RaptorFormFields>
-                  </FormContainer>
-                )} */}
-                {/* {showGraphRagItems(selectedTag as DocumentParserType) &&
-                  useGraphRag && (
-                    <FormContainer>
-                      <UseGraphRagFormField></UseGraphRagFormField>
-                    </FormContainer>
-                  )} */}
-                {showEntityTypes && (
-                  <EntityTypesFormField></EntityTypesFormField>
-                )}
-              </>
+            <FormContainer
+              show={showOne || showMaxTokenNumber}
+              className="space-y-3"
+            >
+              {showOne && <LayoutRecognizeFormField></LayoutRecognizeFormField>}
+              {showMaxTokenNumber && (
+                <>
+                  <MaxTokenNumberFormField
+                    max={
+                      selectedTag === DocumentParserType.KnowledgeGraph
+                        ? 8192 * 2
+                        : 2048
+                    }
+                  ></MaxTokenNumberFormField>
+                  <DelimiterFormField></DelimiterFormField>
+                </>
+              )}
+            </FormContainer>
+            <FormContainer
+              show={showAutoKeywords(selectedTag) || showExcelToHtml}
+              className="space-y-3"
+            >
+              {showAutoKeywords(selectedTag) && (
+                <>
+                  <AutoKeywordsFormField></AutoKeywordsFormField>
+                  <AutoQuestionsFormField></AutoQuestionsFormField>
+                </>
+              )}
+              {showExcelToHtml && <ExcelToHtmlFormField></ExcelToHtmlFormField>}
+            </FormContainer>
+            {showRaptorParseConfiguration(
+              selectedTag as DocumentParserType,
+            ) && (
+              <FormContainer>
+                <RaptorFormFields></RaptorFormFields>
+              </FormContainer>
            )}
+            {showGraphRagItems(selectedTag as DocumentParserType) &&
+              useGraphRag && (
+                <FormContainer>
+                  <UseGraphRagFormField></UseGraphRagFormField>
+                </FormContainer>
+              )}
+            {showEntityTypes && <EntityTypesFormField></EntityTypesFormField>}
          </form>
        </Form>
        <DialogFooter>
--- a/web/src/components/chunk-method-dialog/use-default-parser-values.ts
+++ b/web/src/components/chunk-method-dialog/use-default-parser-values.ts
@ -1,7 +1,7 @@
 import { IParserConfig } from '@/interfaces/database/document';
 import { useCallback, useMemo } from 'react';
 import { useTranslation } from 'react-i18next';
-import { ParseDocumentType } from '../layout-recognize-form-field';
+import { DocumentType } from '../layout-recognize-form-field';

 export function useDefaultParserValues() {
  const { t } = useTranslation();
@ -9,23 +9,23 @@ export function useDefaultParserValues() {
  const defaultParserValues = useMemo(() => {
    const defaultParserValues = {
      task_page_size: 12,
-      layout_recognize: ParseDocumentType.DeepDOC,
+      layout_recognize: DocumentType.DeepDOC,
      chunk_token_num: 512,
      delimiter: '\n',
      auto_keywords: 0,
      auto_questions: 0,
      html4excel: false,
-      // raptor: {
-      //   use_raptor: false,
-      //   prompt: t('knowledgeConfiguration.promptText'),
-      //   max_token: 256,
-      //   threshold: 0.1,
-      //   max_cluster: 64,
-      //   random_seed: 0,
-      // },
-      // graphrag: {
-      //   use_graphrag: false,
-      // },
+      raptor: {
+        use_raptor: false,
+        prompt: t('knowledgeConfiguration.promptText'),
+        max_token: 256,
+        threshold: 0.1,
+        max_cluster: 64,
+        random_seed: 0,
+      },
+      graphrag: {
+        use_graphrag: false,
+      },
      entity_types: [],
      pages: [],
    };
--- a/web/src/components/confirm-delete-dialog.tsx
+++ b/web/src/components/confirm-delete-dialog.tsx
@ -8,7 +8,7 @@ import {
  AlertDialogTitle,
  AlertDialogTrigger,
 } from '@/components/ui/alert-dialog';
-import { DialogProps } from '@radix-ui/react-dialog';
+import { PropsWithChildren } from 'react';
 import { useTranslation } from 'react-i18next';

 interface IProps {
@ -24,10 +24,7 @@ export function ConfirmDeleteDialog({
  onOk,
  onCancel,
  hidden = false,
-  onOpenChange,
-  open,
-  defaultOpen,
-}: IProps & DialogProps) {
+}: IProps & PropsWithChildren) {
  const { t } = useTranslation();

  if (hidden) {
@ -35,11 +32,7 @@ export function ConfirmDeleteDialog({
  }

  return (
-    <AlertDialog
-      onOpenChange={onOpenChange}
-      open={open}
-      defaultOpen={defaultOpen}
-    >
+    <AlertDialog>
      <AlertDialogTrigger asChild>{children}</AlertDialogTrigger>
      <AlertDialogContent
        onSelect={(e) => e.preventDefault()}
--- a/web/src/components/cross-language-form-field.tsx
+++ b/web/src/components/cross-language-form-field.tsx
@ -22,7 +22,7 @@ const Languages = [
  'Vietnamese',
 ];

-export const crossLanguageOptions = Languages.map((x) => ({
+const options = Languages.map((x) => ({
  label: t('language.' + toLower(x)),
  value: x,
 }));
@ -30,13 +30,11 @@ export const crossLanguageOptions = Languages.map((x) => ({
 type CrossLanguageItemProps = {
  name?: string;
  vertical?: boolean;
-  label?: string;
 };

 export const CrossLanguageFormField = ({
  name = 'prompt_config.cross_languages',
  vertical = true,
-  label,
 }: CrossLanguageItemProps) => {
  const { t } = useTranslation();
  const form = useFormContext();
@ -55,11 +53,11 @@ export const CrossLanguageFormField = ({
          })}
        >
          <FormLabel tooltip={t('chat.crossLanguageTip')}>
-            {label || t('chat.crossLanguage')}
+            {t('chat.crossLanguage')}
          </FormLabel>
          <FormControl>
            <MultiSelect
-              options={crossLanguageOptions}
+              options={options}
              placeholder={t('fileManager.pleaseSelect')}
              maxCount={100}
              {...field}
--- a/web/src/components/data-pipeline-select/index.tsx
+++ b/web/src/components/data-pipeline-select/index.tsx
@ -1,120 +0,0 @@
-import { AgentCategory } from '@/constants/agent';
-import { useTranslate } from '@/hooks/common-hooks';
-import { useFetchAgentList } from '@/hooks/use-agent-request';
-import { buildSelectOptions } from '@/utils/component-util';
-import { ArrowUpRight } from 'lucide-react';
-import { useEffect, useMemo } from 'react';
-import { useFormContext } from 'react-hook-form';
-import { SelectWithSearch } from '../originui/select-with-search';
-import {
-  FormControl,
-  FormField,
-  FormItem,
-  FormLabel,
-  FormMessage,
-} from '../ui/form';
-import { MultiSelect } from '../ui/multi-select';
-export interface IDataPipelineSelectNode {
-  id?: string;
-  name?: string;
-  avatar?: string;
-}
-
-interface IProps {
-  toDataPipeline?: () => void;
-  formFieldName: string;
-  isMult?: boolean;
-  setDataList?: (data: IDataPipelineSelectNode[]) => void;
-}
-
-export function DataFlowSelect(props: IProps) {
-  const { toDataPipeline, formFieldName, isMult = false, setDataList } = props;
-  const { t } = useTranslate('knowledgeConfiguration');
-  const form = useFormContext();
-  const toDataPipLine = () => {
-    toDataPipeline?.();
-  };
-  const { data: dataPipelineOptions } = useFetchAgentList({
-    canvas_category: AgentCategory.DataflowCanvas,
-  });
-  const options = useMemo(() => {
-    const option = buildSelectOptions(
-      dataPipelineOptions?.canvas,
-      'id',
-      'title',
-    );
-
-    return option || [];
-  }, [dataPipelineOptions]);
-
-  const nodes = useMemo(() => {
-    return (
-      dataPipelineOptions?.canvas?.map((item) => {
-        return {
-          id: item?.id,
-          name: item?.title,
-          avatar: item?.avatar,
-        };
-      }) || []
-    );
-  }, [dataPipelineOptions]);
-
-  useEffect(() => {
-    setDataList?.(nodes);
-  }, [nodes, setDataList]);
-
-  return (
-    <FormField
-      control={form.control}
-      name={formFieldName}
-      render={({ field }) => (
-        <FormItem className=" items-center space-y-0 ">
-          <div className="flex flex-col gap-1">
-            <div className="flex gap-2 justify-between ">
-              <FormLabel
-                tooltip={t('dataFlowTip')}
-                className="text-sm text-text-primary whitespace-wrap "
-              >
-                {t('dataPipeline')}
-              </FormLabel>
-              {toDataPipeline && (
-                <div
-                  className="text-sm flex text-text-primary cursor-pointer"
-                  onClick={toDataPipLine}
-                >
-                  {t('buildItFromScratch')}
-                  <ArrowUpRight size={14} />
-                </div>
-              )}
-            </div>
-
-            <div className="text-muted-foreground">
-              <FormControl>
-                <>
-                  {!isMult && (
-                    <SelectWithSearch
-                      {...field}
-                      placeholder={t('dataFlowPlaceholder')}
-                      options={options}
-                    />
-                  )}
-                  {isMult && (
-                    <MultiSelect
-                      {...field}
-                      onValueChange={field.onChange}
-                      placeholder={t('dataFlowPlaceholder')}
-                      options={options}
-                    />
-                  )}
-                </>
-              </FormControl>
-            </div>
-          </div>
-          <div className="flex pt-1">
-            <FormMessage />
-          </div>
-        </FormItem>
-      )}
-    />
-  );
-}
--- a/web/src/components/delimiter-form-field.tsx
+++ b/web/src/components/delimiter-form-field.tsx
@ -16,17 +16,11 @@ interface IProps {
 }

 export const DelimiterInput = forwardRef<HTMLInputElement, InputProps & IProps>(
-  ({ value, onChange, maxLength, defaultValue, ...props }, ref) => {
-    const nextValue = value
-      ?.replaceAll('\n', '\\n')
-      .replaceAll('\t', '\\t')
-      .replaceAll('\r', '\\r');
+  ({ value, onChange, maxLength, defaultValue }, ref) => {
+    const nextValue = value?.replaceAll('\n', '\\n');
    const handleInputChange = (e: React.ChangeEvent<HTMLInputElement>) => {
      const val = e.target.value;
-      const nextValue = val
-        .replaceAll('\\n', '\n')
-        .replaceAll('\\t', '\t')
-        .replaceAll('\\r', '\r');
+      const nextValue = val.replaceAll('\\n', '\n');
      onChange?.(nextValue);
    };
    return (
@ -36,7 +30,6 @@ export const DelimiterInput = forwardRef<HTMLInputElement, InputProps & IProps>(
        maxLength={maxLength}
        defaultValue={defaultValue}
        ref={ref}
-        {...props}
      ></Input>
    );
  },
--- a/Show More
+++ b/Show More
				`@ -0,0 +1 @@`
				<?xml version="1.0" standalone="no"?><!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"><svg t="1756884949583" class="icon" viewBox="0 0 1024 1024" version="1.1" xmlns="http://www.w3.org/2000/svg" p-id="11332" xmlns:xlink="http://www.w3.org/1999/xlink" width="200" height="200"><path d="M190.464 489.472h327.68v40.96h-327.68z" fill="#C7DCFE" p-id="11333"></path><path d="M482.34496 516.5056l111.26784-308.20352 38.54336 13.9264L520.86784 530.432z" fill="#C7DCFE" p-id="11334"></path><path d="M620.544 196.608m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#8FB8FC" p-id="11335"></path><path d="M182.272 509.952m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#C7DCFE" p-id="11336"></path><path d="M558.65344 520.9088l283.77088 163.84-20.48 35.47136-283.77088-163.84z" fill="#C7DCFE" p-id="11337"></path><path d="M841.728 686.08m-122.88 0a122.88 122.88 0 1 0 245.76 0 122.88 122.88 0 1 0-245.76 0Z" fill="#B3CEFE" p-id="11338"></path><path d="M448.67584 803.77856l49.60256-323.91168 40.48896 6.20544-49.60256 323.91168z" fill="#C7DCFE" p-id="11339"></path><path d="M512 530.432m-143.36 0a143.36 143.36 0 1 0 286.72 0 143.36 143.36 0 1 0-286.72 0Z" fill="#4185FF" p-id="11340"></path><path d="M462.848 843.776m-102.4 0a102.4 102.4 0 1 0 204.8 0 102.4 102.4 0 1 0-204.8 0Z" fill="#8FB8FC" p-id="11341"></path></svg>