Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen 
Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2025-12-08 12:32:30 +08:00 · 2025-10-09 12:36:19 +08:00
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions
--- a/rag/flow/base.py
+++ b/rag/flow/base.py
@ -18,9 +18,7 @@ import os
 import time
 from functools import partial
 from typing import Any
-
 import trio
-
 from agent.component.base import ComponentBase, ComponentParamBase
 from api.utils.api_utils import timeout

@ -36,9 +34,9 @@ class ProcessBase(ComponentBase):
    def __init__(self, pipeline, id, param: ProcessParamBase):
        super().__init__(pipeline, id, param)
        if hasattr(self._canvas, "callback"):
-            self.callback = partial(self._canvas.callback, self.component_name)
+            self.callback = partial(self._canvas.callback, id)
        else:
-            self.callback = partial(lambda *args, **kwargs: None, self.component_name)
+            self.callback = partial(lambda *args, **kwargs: None, id)

    async def invoke(self, **kwargs) -> dict[str, Any]:
        self.set_output("_created_time", time.perf_counter())
--- a/rag/flow/chunker/chunker.py
+++ b/rag/flow/chunker/chunker.py
@ -1,212 +0,0 @@
-#
-#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
-#
-#  Licensed under the Apache License, Version 2.0 (the "License");
-#  you may not use this file except in compliance with the License.
-#  You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing, software
-#  distributed under the License is distributed on an "AS IS" BASIS,
-#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#  See the License for the specific language governing permissions and
-#  limitations under the License.
-import random
-
-import trio
-
-from api.db import LLMType
-from api.db.services.llm_service import LLMBundle
-from deepdoc.parser.pdf_parser import RAGFlowPdfParser
-from graphrag.utils import chat_limiter, get_llm_cache, set_llm_cache
-from rag.flow.base import ProcessBase, ProcessParamBase
-from rag.flow.chunker.schema import ChunkerFromUpstream
-from rag.nlp import naive_merge, naive_merge_with_images
-from rag.prompts.generator import keyword_extraction, question_proposal
-
-
-class ChunkerParam(ProcessParamBase):
-    def __init__(self):
-        super().__init__()
-        self.method_options = [
-            # General
-            "general",
-            "onetable",
-            # Customer Service
-            "q&a",
-            "manual",
-            # Recruitment
-            "resume",
-            # Education & Research
-            "book",
-            "paper",
-            "laws",
-            "presentation",
-            # Other
-            # "Tag" # TODO: Other method
-        ]
-        self.method = "general"
-        self.chunk_token_size = 512
-        self.delimiter = "\n"
-        self.overlapped_percent = 0
-        self.page_rank = 0
-        self.auto_keywords = 0
-        self.auto_questions = 0
-        self.tag_sets = []
-        self.llm_setting = {"llm_name": "", "lang": "Chinese"}
-
-    def check(self):
-        self.check_valid_value(self.method.lower(), "Chunk method abnormal.", self.method_options)
-        self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
-        self.check_nonnegative_number(self.page_rank, "Page rank value: (0, 10]")
-        self.check_nonnegative_number(self.auto_keywords, "Auto-keyword value: (0, 10]")
-        self.check_nonnegative_number(self.auto_questions, "Auto-question value: (0, 10]")
-        self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
-
-    def get_input_form(self) -> dict[str, dict]:
-        return {}
-
-
-class Chunker(ProcessBase):
-    component_name = "Chunker"
-
-    def _general(self, from_upstream: ChunkerFromUpstream):
-        self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `General`.")
-        if from_upstream.output_format in ["markdown", "text", "html"]:
-            if from_upstream.output_format == "markdown":
-                payload = from_upstream.markdown_result
-            elif from_upstream.output_format == "text":
-                payload = from_upstream.text_result
-            else:  # == "html"
-                payload = from_upstream.html_result
-
-            if not payload:
-                payload = ""
-
-            cks = naive_merge(
-                payload,
-                self._param.chunk_token_size,
-                self._param.delimiter,
-                self._param.overlapped_percent,
-            )
-            return [{"text": c} for c in cks]
-
-        # json
-        sections, section_images = [], []
-        for o in from_upstream.json_result or []:
-            sections.append((o.get("text", ""), o.get("position_tag", "")))
-            section_images.append(o.get("image"))
-
-        chunks, images = naive_merge_with_images(
-            sections,
-            section_images,
-            self._param.chunk_token_size,
-            self._param.delimiter,
-            self._param.overlapped_percent,
-        )
-
-        return [
-            {
-                "text": RAGFlowPdfParser.remove_tag(c),
-                "image": img,
-                "positions": RAGFlowPdfParser.extract_positions(c),
-            }
-            for c, img in zip(chunks, images)
-        ]
-
-    def _q_and_a(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _resume(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _manual(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _table(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _paper(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _book(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _laws(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _presentation(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    def _one(self, from_upstream: ChunkerFromUpstream):
-        pass
-
-    async def _invoke(self, **kwargs):
-        function_map = {
-            "general": self._general,
-            "q&a": self._q_and_a,
-            "resume": self._resume,
-            "manual": self._manual,
-            "table": self._table,
-            "paper": self._paper,
-            "book": self._book,
-            "laws": self._laws,
-            "presentation": self._presentation,
-            "one": self._one,
-        }
-
-        try:
-            from_upstream = ChunkerFromUpstream.model_validate(kwargs)
-        except Exception as e:
-            self.set_output("_ERROR", f"Input error: {str(e)}")
-            return
-
-        chunks = function_map[self._param.method](from_upstream)
-        llm_setting = self._param.llm_setting
-
-        async def auto_keywords():
-            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
-
-            async def doc_keyword_extraction(chat_mdl, ck, topn):
-                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "keywords", {"topn": topn})
-                if not cached:
-                    async with chat_limiter:
-                        cached = await trio.to_thread.run_sync(lambda: keyword_extraction(chat_mdl, ck["text"], topn))
-                    set_llm_cache(chat_mdl.llm_name, ck["text"], cached, "keywords", {"topn": topn})
-                if cached:
-                    ck["keywords"] = cached.split(",")
-
-            async with trio.open_nursery() as nursery:
-                for ck in chunks:
-                    nursery.start_soon(doc_keyword_extraction, chat_mdl, ck, self._param.auto_keywords)
-
-        async def auto_questions():
-            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
-
-            async def doc_question_proposal(chat_mdl, d, topn):
-                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "question", {"topn": topn})
-                if not cached:
-                    async with chat_limiter:
-                        cached = await trio.to_thread.run_sync(lambda: question_proposal(chat_mdl, ck["text"], topn))
-                    set_llm_cache(chat_mdl.llm_name, ck["text"], cached, "question", {"topn": topn})
-                if cached:
-                    d["questions"] = cached.split("\n")
-
-            async with trio.open_nursery() as nursery:
-                for ck in chunks:
-                    nursery.start_soon(doc_question_proposal, chat_mdl, ck, self._param.auto_questions)
-
-        async with trio.open_nursery() as nursery:
-            if self._param.auto_questions:
-                nursery.start_soon(auto_questions)
-            if self._param.auto_keywords:
-                nursery.start_soon(auto_keywords)
-
-        if self._param.page_rank:
-            for ck in chunks:
-                ck["page_rank"] = self._param.page_rank
-
-        self.set_output("chunks", chunks)
--- a/rag/flow/extractor/init.py
+++ b/rag/flow/extractor/init.py
--- a/rag/flow/extractor/extractor.py
+++ b/rag/flow/extractor/extractor.py
@ -0,0 +1,63 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import random
+from copy import deepcopy
+from agent.component.llm import LLMParam, LLM
+from rag.flow.base import ProcessBase, ProcessParamBase
+
+
+class ExtractorParam(ProcessParamBase, LLMParam):
+    """Parameters of the Extractor component.
+
+    Combines the pipeline-process parameter base with the LLM parameters and
+    adds ``field_name``.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Label "Result Destination" in check(); must be non-empty.
+        self.field_name = ""
+
+    def check(self):
+        """Run the inherited checks, then require a non-empty ``field_name``."""
+        super().check()
+        self.check_empty(self.field_name, "Result Destination")
+
+
+class Extractor(ProcessBase, LLM):
+    """Pipeline component that runs the configured LLM prompt over its inputs.
+
+    If one of the resolved inputs is a non-empty list it is treated as the
+    chunk list: the prompt is generated once per chunk (with the chunk's
+    ``"text"`` substituted for the list input) and the result is stored in the
+    chunk under ``self._param.field_name``.  Otherwise the prompt is generated
+    once and a single-chunk list is emitted.
+    """
+
+    component_name = "Extractor"
+
+    async def _invoke(self, **kwargs):
+        """Generate the extraction result(s) and publish them as ``chunks``.
+
+        Outputs set: ``output_format`` (always ``"chunks"``) and ``chunks``.
+        """
+        self.set_output("output_format", "chunks")
+        self.callback(random.randint(1, 5) / 100.0, "Start to generate.")
+        field = self._param.field_name
+
+        chunks = []
+        chunks_key = ""
+        args = {}
+        for k, v in self.get_input_elements().items():
+            args[k] = v["value"]
+            if isinstance(args[k], list):
+                # Results are written into copies, not into the input objects.
+                # If several inputs are lists, the last one wins.
+                chunks = deepcopy(args[k])
+                chunks_key = k
+
+        if not chunks:
+            msg, sys_prompt = self._sys_prompt_and_msg([], args)
+            msg.insert(0, {"role": "system", "content": sys_prompt})
+            self.set_output("chunks", [{field: self._generate(msg)}])
+            return
+
+        total = len(chunks)
+        # Report roughly 100 times at most for large inputs, every chunk for
+        # small ones.
+        step = total // 100 + 1
+        for i, ck in enumerate(chunks):
+            args[chunks_key] = ck["text"]
+            msg, sys_prompt = self._sys_prompt_and_msg([], args)
+            msg.insert(0, {"role": "system", "content": sys_prompt})
+            ck[field] = self._generate(msg)
+            # The remainder is compared with 0, not 1: with fewer than 100
+            # chunks ``step`` is 1 and ``i % 1`` is always 0, so a comparison
+            # against 1 would never report any progress.
+            if i % step == 0:
+                self.callback((i + 1) / total, f"{i+1} / {total}")
+        self.set_output("chunks", chunks)
+
+
--- a/rag/flow/extractor/schema.py
+++ b/rag/flow/extractor/schema.py
@ -0,0 +1,38 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class ExtractorFromUpstream(BaseModel):
+    # Schema describing the payload an Extractor receives from upstream.
+    # NOTE(review): extractor.py in this change does not reference this model;
+    # presumably it is meant to be validated against the upstream kwargs like
+    # the sibling *FromUpstream schemas - confirm.
+
+    # Timing fields; read from the "_created_time" / "_elapsed_time" keys.
+    created_time: float | None = Field(default=None, alias="_created_time")
+    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
+
+    # The only required field.
+    name: str
+    file: dict | None = Field(default=None)
+    chunks: list[dict[str, Any]] | None = Field(default=None)
+
+    # Names which of the payload fields below (or `chunks`) carries the data.
+    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
+
+    # Payload fields; each is read from the key named by its alias.
+    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
+    markdown_result: str | None = Field(default=None, alias="markdown")
+    text_result: str | None = Field(default=None, alias="text")
+    html_result: str | None = Field(default=None, alias="html")
+
+    # populate_by_name: accept either the field name or its alias.
+    # extra="forbid": any key not declared above is a validation error.
+    model_config = ConfigDict(populate_by_name=True, extra="forbid")
+
+    # def to_dict(self, *, exclude_none: bool = True) -> dict:
+    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/file.py
+++ b/rag/flow/file.py
@ -14,10 +14,7 @@
 #  limitations under the License.
 #
 from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
 from rag.flow.base import ProcessBase, ProcessParamBase
-from rag.utils.storage_factory import STORAGE_IMPL


 class FileParam(ProcessParamBase):
@ -41,10 +38,13 @@ class File(ProcessBase):
                self.set_output("_ERROR", f"Document({self._canvas._doc_id}) not found!")
                return

-            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
-            self.set_output("blob", STORAGE_IMPL.get(b, n))
+            #b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
+            #self.set_output("blob", STORAGE_IMPL.get(b, n))
            self.set_output("name", doc.name)
        else:
            file = kwargs.get("file")
            self.set_output("name", file["name"])
-            self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
+            self.set_output("file", file)
+            #self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
+
+        self.callback(1, "File fetched.")
--- a/rag/flow/hierarchical_merger/init.py
+++ b/rag/flow/hierarchical_merger/init.py
@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
--- a/rag/flow/hierarchical_merger/hierarchical_merger.py
+++ b/rag/flow/hierarchical_merger/hierarchical_merger.py
@ -0,0 +1,186 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+import random
+import re
+from copy import deepcopy
+from functools import partial
+
+import trio
+
+from api.utils import get_uuid
+from api.utils.base64_image import id2image, image2id
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
+from rag.nlp import concat_img
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+class HierarchicalMergerParam(ProcessParamBase):
+    """Parameters of the HierarchicalMerger component."""
+
+    def __init__(self):
+        super().__init__()
+        # One entry per heading level (index 0 = top level); each entry is an
+        # iterable of regex patterns that the merger tries with re.search.
+        self.levels = []
+        # Tree depth the merger compares against while collecting chunk paths.
+        # NOTE(review): compared with an int depth via `<` and `==`, so this
+        # presumably has to be an int - confirm what the caller supplies.
+        self.hierarchy = None
+
+    def check(self):
+        """Require both ``levels`` and ``hierarchy`` to be non-empty."""
+        self.check_empty(self.levels, "Hierarchical setups.")
+        self.check_empty(self.hierarchy, "Hierarchy number.")
+
+    def get_input_form(self) -> dict[str, dict]:
+        """Return the input form description; this component declares none."""
+        return {}
+
+
+class HierarchicalMerger(ProcessBase):
+    """Merge upstream lines into chunks that follow a heading hierarchy.
+
+    Every input line is classified against ``self._param.levels`` (a list of
+    regex lists, one per heading level).  Heading lines become nodes of a
+    tree, non-heading lines are attached to the most recent deepest node, and
+    the chunks are built from root-to-node paths through that tree.
+    """
+
+    component_name = "HierarchicalMerger"
+
+    async def _invoke(self, **kwargs):
+        """Validate the upstream payload, build the tree and emit ``chunks``.
+
+        On a validation failure the ``_ERROR`` output is set and nothing else
+        is produced.  Otherwise ``output_format`` is set to ``"chunks"`` and
+        ``chunks`` to a list of dicts (``text`` only for plain-text input;
+        ``text``/``image``/``positions`` for json or chunk input).
+        """
+        try:
+            from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
+        except Exception as e:
+            self.set_output("_ERROR", f"Input error: {str(e)}")
+            return
+
+        self.set_output("output_format", "chunks")
+        self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
+
+        is_plain_text = from_upstream.output_format in ["markdown", "text", "html"]
+        section_images = []
+        if is_plain_text:
+            if from_upstream.output_format == "markdown":
+                payload = from_upstream.markdown_result
+            elif from_upstream.output_format == "text":
+                payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
+            lines = [ln for ln in (payload or "").split("\n") if ln]
+        else:
+            if from_upstream.output_format == "chunks":
+                arr = from_upstream.chunks
+            else:
+                arr = from_upstream.json_result
+            # Both fields default to None; treat a missing payload as empty
+            # instead of failing on iteration.
+            arr = arr or []
+            lines = [o.get("text", "") for o in arr]
+            section_images = [o.get("img_id") for o in arr]
+
+        # Classify every line: index of the first level with a matching
+        # pattern, or len(levels) for ordinary (non-heading) text.
+        n_levels = len(self._param.levels)
+        matches = [
+            next(
+                (
+                    lvl
+                    for lvl, regs in enumerate(self._param.levels)
+                    if any(re.search(reg, txt) for reg in regs)
+                ),
+                n_levels,
+            )
+            for txt in lines
+        ]
+
+        def new_node(level, index):
+            # "index" is the heading's line number, "texts" the line numbers of
+            # the body lines attached to it.
+            return {"level": level, "index": index, "texts": [], "children": []}
+
+        root = new_node(-1, -1)
+        for i, m in enumerate(matches):
+            if m == 0:
+                # Top-level headings always hang directly off the root.
+                root["children"].append(new_node(m, i))
+                continue
+
+            node = root
+            if m == n_levels:
+                # Body text: follow the last child down to the deepest node.
+                while node["children"]:
+                    node = node["children"][-1]
+                node["texts"].append(i)
+            else:
+                # Lower-level heading: follow the last child down until the
+                # direct parent level (or a childless node) is reached.
+                while node["children"] and m != node["level"] + 1:
+                    node = node["children"][-1]
+                node["children"].append(new_node(m, i))
+
+        all_pathes = []
+
+        def collect(n, path, depth):
+            # NOTE(review): from depth == hierarchy downwards the same list
+            # object is shared and appended more than once, so `all_pathes`
+            # can hold repeated entries.  Kept exactly as written - confirm
+            # whether the repetition is intended.
+            if not n["children"] and path:
+                all_pathes.append(path)
+
+            for nn in n["children"]:
+                if depth < self._param.hierarchy:
+                    _path = deepcopy(path)
+                else:
+                    _path = path
+                _path.extend([nn["index"], *nn["texts"]])
+                collect(nn, _path, depth + 1)
+
+                if depth == self._param.hierarchy:
+                    all_pathes.append(_path)
+
+        collect(root, [], 0)
+
+        # Lines that precede the first heading form a chunk of their own.
+        if root["texts"]:
+            all_pathes.insert(0, root["texts"])
+
+        if is_plain_text:
+            texts = ["".join(lines[i] + "\n" for i in path) for path in all_pathes]
+            self.set_output("chunks", [{"text": c} for c in texts if c])
+        else:
+            cks = []
+            for path in all_pathes:
+                txt = ""
+                img = None
+                for i in path:
+                    txt += lines[i] + "\n"
+                    # Keep the merged image: without the assignment `img`
+                    # would stay None for every chunk.
+                    img = concat_img(img, id2image(section_images[i], partial(STORAGE_IMPL.get)))
+                cks.append(
+                    {
+                        "text": RAGFlowPdfParser.remove_tag(txt),
+                        "image": img,
+                        "positions": RAGFlowPdfParser.extract_positions(txt),
+                    }
+                )
+            async with trio.open_nursery() as nursery:
+                for d in cks:
+                    nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+            self.set_output("chunks", cks)
+
+        self.callback(1, "Done.")
--- a/rag/flow/hierarchical_merger/schema.py
+++ b/rag/flow/hierarchical_merger/schema.py
@ -0,0 +1,37 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class HierarchicalMergerFromUpstream(BaseModel):
+    # Schema the HierarchicalMerger validates its upstream kwargs against.
+
+    # Timing fields; read from the "_created_time" / "_elapsed_time" keys.
+    created_time: float | None = Field(default=None, alias="_created_time")
+    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
+
+    # The only required field.
+    name: str
+    file: dict | None = Field(default=None)
+    chunks: list[dict[str, Any]] | None = Field(default=None)
+
+    # The merger branches on "markdown" / "text" / "html" as well as on
+    # "chunks" and json input, so all of them have to pass validation here;
+    # otherwise the plain-text branch of the merger can never be reached.
+    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)
+    # Payload fields; each is read from the key named by its alias.
+    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
+    markdown_result: str | None = Field(default=None, alias="markdown")
+    text_result: str | None = Field(default=None, alias="text")
+    html_result: str | None = Field(default=None, alias="html")
+
+    # populate_by_name: accept either the field name or its alias.
+    # extra="forbid": any key not declared above is a validation error.
+    model_config = ConfigDict(populate_by_name=True, extra="forbid")
+
+    # def to_dict(self, *, exclude_none: bool = True) -> dict:
+    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -13,20 +13,28 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 import io
-import logging
+import json
+import os
 import random
+from functools import partial

 import trio
 import numpy as np
 from PIL import Image

 from api.db import LLMType
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
 from api.db.services.llm_service import LLMBundle
+from api.utils import get_uuid
+from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
+from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
+from rag.utils.storage_factory import STORAGE_IMPL


 class ParserParam(ProcessParamBase):
@ -45,12 +53,14 @@ class ParserParam(ProcessParamBase):
            "word": [
                "json",
            ],
-            "ppt": [],
+            "slides": [
+                "json",
+            ],
            "image": [
                "text"
            ],
-            "email": [],
-            "text": [
+            "email": ["text", "json"],
+            "text&markdown": [
                "text",
                "json"
            ],
@ -63,7 +73,6 @@ class ParserParam(ProcessParamBase):
        self.setups = {
            "pdf": {
                "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
-                "llm_id": "",
                "lang": "Chinese",
                "suffix": [
                    "pdf",
@ -85,23 +94,29 @@ class ParserParam(ProcessParamBase):
                ],
                "output_format": "json",
            },
-            "markdown": {
-                "suffix": ["md", "markdown"],
+            "text&markdown": {
+                "suffix": ["md", "markdown", "mdx", "txt"],
+                "output_format": "json",
+            },
+            "slides": {
+                "suffix": [
+                    "pptx",
+                ],
                "output_format": "json",
            },
-            "ppt": {},
            "image": {
-                "parse_method": ["ocr", "vlm"],
+                "parse_method": "ocr",
                "llm_id": "",
                "lang": "Chinese",
+                "system_prompt": "",
                "suffix": ["jpg", "jpeg", "png", "gif"],
-                "output_format": "json",
+                "output_format": "text",
            },
-            "email": {},
-            "text": {
+            "email": {
                "suffix": [
-                    "txt"
+                  "eml", "msg"
                ],
+                "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
                "output_format": "json",
            },
            "audio": {
@ -131,13 +146,10 @@ class ParserParam(ProcessParamBase):
        pdf_config = self.setups.get("pdf", {})
        if pdf_config:
            pdf_parse_method = pdf_config.get("parse_method", "")
-            self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
+            self.check_empty(pdf_parse_method, "Parse method abnormal.")

-            if pdf_parse_method not in ["deepdoc", "plain_text"]:
-                self.check_empty(pdf_config.get("llm_id"), "VLM")
-
-            pdf_language = pdf_config.get("lang", "")
-            self.check_empty(pdf_language, "Language")
+            if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
+                self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")

            pdf_output_format = pdf_config.get("output_format", "")
            self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
@ -147,32 +159,38 @@ class ParserParam(ProcessParamBase):
            spreadsheet_output_format = spreadsheet_config.get("output_format", "")
            self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])

-        doc_config = self.setups.get("doc", "")
+        doc_config = self.setups.get("word", "")
        if doc_config:
            doc_output_format = doc_config.get("output_format", "")
-            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"])
+            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
+
+        slides_config = self.setups.get("slides", "")
+        if slides_config:
+            slides_output_format = slides_config.get("output_format", "")
+            self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])

        image_config = self.setups.get("image", "")
        if image_config:
            image_parse_method = image_config.get("parse_method", "")
-            self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
            if image_parse_method not in ["ocr"]:
-                self.check_empty(image_config.get("llm_id"), "VLM")
+                self.check_empty(image_config.get("lang", ""), "Image VLM language")

-            image_language = image_config.get("lang", "")
-            self.check_empty(image_language, "Language")
-
-        text_config = self.setups.get("text", "")
+        text_config = self.setups.get("text&markdown", "")
        if text_config:
            text_output_format = text_config.get("output_format", "")
-            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
+            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])

        audio_config = self.setups.get("audio", "")
        if audio_config:
-            self.check_empty(audio_config.get("llm_id"), "VLM")
+            self.check_empty(audio_config.get("llm_id"), "Audio VLM")
            audio_language = audio_config.get("lang", "")
            self.check_empty(audio_language, "Language")

+        email_config = self.setups.get("email", "")
+        if email_config:
+            email_output_format = email_config.get("output_format", "")
+            self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
+
    def get_input_form(self) -> dict[str, dict]:
        return {}

@ -180,21 +198,18 @@ class ParserParam(ProcessParamBase):
 class Parser(ProcessBase):
    component_name = "Parser"

-    def _pdf(self, from_upstream: ParserFromUpstream):
+    def _pdf(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
-
-        blob = from_upstream.blob
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])

-        if conf.get("parse_method") == "deepdoc":
+        if conf.get("parse_method").lower() == "deepdoc":
            bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
-        elif conf.get("parse_method") == "plain_text":
+        elif conf.get("parse_method").lower() == "plain_text":
            lines, _ = PlainParser()(blob)
            bboxes = [{"text": t} for t, _ in lines]
        else:
-            assert conf.get("llm_id")
-            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
+            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
            lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
            bboxes = []
            for t, poss in lines:
@ -214,66 +229,63 @@ class Parser(ProcessBase):
                mkdn += b.get("text", "") + "\n"
            self.set_output("markdown", mkdn)

-    def _spreadsheet(self, from_upstream: ParserFromUpstream):
+    def _spreadsheet(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
-
-        blob = from_upstream.blob
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
-
-        print("spreadsheet {conf=}", flush=True)
        spreadsheet_parser = ExcelParser()
        if conf.get("output_format") == "html":
-            html = spreadsheet_parser.html(blob, 1000000000)
-            self.set_output("html", html)
+            htmls = spreadsheet_parser.html(blob, 1000000000)
+            self.set_output("html", htmls[0])
        elif conf.get("output_format") == "json":
            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
        elif conf.get("output_format") == "markdown":
            self.set_output("markdown", spreadsheet_parser.markdown(blob))

-    def _word(self, from_upstream: ParserFromUpstream):
-        from tika import parser as  word_parser
-
+    def _word(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
-
-        blob = from_upstream.blob
-        name = from_upstream.name
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
-
-        print("word {conf=}", flush=True)
-        doc_parsed = word_parser.from_buffer(blob)
-
-        sections = []
-        if doc_parsed.get("content"):
-            sections = doc_parsed["content"].split("\n")
-            sections = [{"text": section} for section in sections if section]
-        else:
-            logging.warning(f"tika.parser got empty content from {name}.")
-
+        docx_parser = Docx()
+        sections, tbls = docx_parser(name, binary=blob)
+        sections = [{"text": section[0], "image": section[1]} for section in sections if section]
+        sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
        # json
        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
            self.set_output("json", sections)

-    def _markdown(self, from_upstream: ParserFromUpstream):
+    def _slides(self, name, blob):
+        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
+
+        conf = self._param.setups["slides"]
+        self.set_output("output_format", conf["output_format"])
+
+        ppt_parser = ppt_parser()
+        txts = ppt_parser(blob, 0, 100000, None)
+
+        sections = [{"text": section} for section in txts if section.strip()]
+
+        # json
+        assert conf.get("output_format") == "json", "have to be json for ppt"
+        if conf.get("output_format") == "json":
+            self.set_output("json", sections)
+
+    def _markdown(self, name, blob):
        from functools import reduce

        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
-
-        blob = from_upstream.blob
-        name = from_upstream.name
-        conf = self._param.setups["markdown"]
+        conf = self._param.setups["text&markdown"]
        self.set_output("output_format", conf["output_format"])

        markdown_parser = naive_markdown_parser()
        sections, tables = markdown_parser(name, blob, separate_tables=False)

-        # json
-        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
            json_results = []

@ -291,69 +303,51 @@ class Parser(ProcessBase):
                json_results.append(json_result)

            self.set_output("json", json_results)
-
-    def _text(self, from_upstream: ParserFromUpstream):
-        from deepdoc.parser.utils import get_text
-
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
-
-        blob = from_upstream.blob
-        name = from_upstream.name
-        conf = self._param.setups["text"]
-        self.set_output("output_format", conf["output_format"])
-
-        # parse binary to text
-        text_content = get_text(name, binary=blob)
-
-        if conf.get("output_format") == "json":
-            result = [{"text": text_content}]
-            self.set_output("json", result)
        else:
-            result = text_content
-            self.set_output("text", result)
+            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))

-    def _image(self, from_upstream: ParserFromUpstream):
+
+    def _image(self, name, blob):
        from deepdoc.vision import OCR

        self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
-
-        blob = from_upstream.blob
        conf = self._param.setups["image"]
        self.set_output("output_format", conf["output_format"])

        img = Image.open(io.BytesIO(blob)).convert("RGB")
-        lang = conf["lang"]

        if conf["parse_method"] == "ocr":
            # use ocr, recognize chars only
            ocr = OCR()
            bxs = ocr(np.array(img))  # return boxes and recognize result
            txt = "\n".join([t[0] for _, t in bxs if t[0]])
-
        else:
+            lang = conf["lang"]
            # use VLM to describe the picture
-            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
+            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
            img_binary = io.BytesIO()
            img.save(img_binary, format="JPEG")
            img_binary.seek(0)
-            txt = cv_model.describe(img_binary.read())
+
+            system_prompt = conf.get("system_prompt")
+            if system_prompt:
+                txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
+            else:
+                txt = cv_model.describe(img_binary.read())

        self.set_output("text", txt)

-    def _audio(self, from_upstream: ParserFromUpstream):
+    def _audio(self, name, blob):
        import os
        import tempfile

        self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

-        blob = from_upstream.blob
-        name = from_upstream.name
        conf = self._param.setups["audio"]
        self.set_output("output_format", conf["output_format"])

        lang = conf["lang"]
        _, ext = os.path.splitext(name)
-        tmp_path = ""
        with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
            tmpf.write(blob)
            tmpf.flush()
@ -364,15 +358,131 @@ class Parser(ProcessBase):

            self.set_output("text", txt)

+    def _email(self, name, blob):
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
+
+        email_content = {}
+        conf = self._param.setups["email"]
+        target_fields = conf["fields"]
+
+        _, ext = os.path.splitext(name)
+        if ext == ".eml":
+            # handle eml file
+            from email import policy
+            from email.parser import BytesParser
+
+            msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
+            email_content['metadata'] = {}
+            # handle header info
+            for header, value in msg.items():
+                # get fields like from, to, cc, bcc, date, subject
+                if header.lower() in target_fields:
+                    email_content[header.lower()] = value
+                # get metadata
+                elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
+                    email_content["metadata"][header.lower()] = value
+            # get body
+            if "body" in target_fields:
+                body_text, body_html = [], []
+                def _add_content(m, content_type):
+                    if content_type == "text/plain":
+                        body_text.append(
+                            m.get_payload(decode=True).decode(m.get_content_charset())
+                        )
+                    elif content_type == "text/html":
+                        body_html.append(
+                            m.get_payload(decode=True).decode(m.get_content_charset())
+                        )
+                    elif "multipart" in content_type:
+                        if m.is_multipart():
+                            for part in m.iter_parts():
+                                _add_content(part, part.get_content_type())
+
+                _add_content(msg, msg.get_content_type())
+
+                email_content["text"] = body_text
+                email_content["text_html"] = body_html
+            # get attachment
+            if "attachments" in target_fields:
+                attachments = []
+                for part in msg.iter_attachments():
+                    content_disposition = part.get("Content-Disposition")
+                    if content_disposition:
+                        dispositions = content_disposition.strip().split(";")
+                        if dispositions[0].lower() == "attachment":
+                            filename = part.get_filename()
+                            payload = part.get_payload(decode=True)
+                            attachments.append({
+                                "filename": filename,
+                                "payload": payload,
+                            })
+                email_content["attachments"] = attachments
+        else:
+            # handle msg file
+            import extract_msg
+            print("handle a msg file.")
+            msg = extract_msg.Message(blob)
+            # handle header info
+            basic_content = {
+                "from": msg.sender,
+                "to": msg.to,
+                "cc": msg.cc,
+                "bcc": msg.bcc,
+                "date": msg.date,
+                "subject": msg.subject,
+            }
+            email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
+            # get metadata
+            email_content['metadata'] = {
+                'message_id': msg.messageId,
+                'in_reply_to': msg.inReplyTo,
+            }
+            # get body
+            if "body" in target_fields:
+                email_content["text"] = msg.body  # usually empty. try text_html instead
+                email_content["text_html"] = msg.htmlBody
+            # get attachments
+            if "attachments" in target_fields:
+                attachments = []
+                for t in msg.attachments:
+                    attachments.append({
+                        "filename": t.name,
+                        "payload": t.data  # binary
+                    })
+                email_content["attachments"] = attachments
+
+        if conf["output_format"] == "json":
+            self.set_output("json", [email_content])
+        else:
+            content_txt = ''
+            for k, v in email_content.items():
+                if isinstance(v, str):
+                    # basic info
+                    content_txt += f'{k}:{v}' + "\n"
+                elif isinstance(v, dict):
+                    # metadata
+                    content_txt += f'{k}:{json.dumps(v)}' + "\n"
+                elif isinstance(v, list):
+                    # attachments or others
+                    for fb in v:
+                        if isinstance(fb, dict):
+                            # attachments
+                            content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
+                        else:
+                            # str, usually plain text
+                            content_txt += fb
+            self.set_output("text", content_txt)
+
    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
-            "markdown": self._markdown,
+            "text&markdown": self._markdown,
            "spreadsheet": self._spreadsheet,
+            "slides": self._slides,
            "word": self._word,
-            "text": self._text,
            "image": self._image,
            "audio": self._audio,
+            "email": self._email,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
@ -380,8 +490,25 @@ class Parser(ProcessBase):
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

+        name = from_upstream.name
+        if self._canvas._doc_id:
+            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
+            blob = STORAGE_IMPL.get(b, n)
+        else:
+            blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
+
+        done = False
        for p_type, conf in self._param.setups.items():
            if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
                continue
-            await trio.to_thread.run_sync(function_map[p_type], from_upstream)
+            await trio.to_thread.run_sync(function_map[p_type], name, blob)
+            done = True
            break
+
+        if not done:
+            raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
+
+        outs = self.output()
+        async with trio.open_nursery() as nursery:
+            for d in outs.get("json", []):
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
--- a/rag/flow/parser/schema.py
+++ b/rag/flow/parser/schema.py
@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    blob: bytes
-
+    file: dict | None = Field(default=None)
    model_config = ConfigDict(populate_by_name=True, extra="forbid")
--- a/rag/flow/pipeline.py
+++ b/rag/flow/pipeline.py
@ -17,41 +17,92 @@ import datetime
 import json
 import logging
 import random
-import time
-
+from timeit import default_timer as timer
 import trio
-
 from agent.canvas import Graph
 from api.db.services.document_service import DocumentService
+from api.db.services.task_service import has_canceled, TaskService, CANVAS_DEBUG_DOC_ID
 from rag.utils.redis_conn import REDIS_CONN


 class Pipeline(Graph):
-    def __init__(self, dsl: str, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
+    def __init__(self, dsl: str|dict, tenant_id=None, doc_id=None, task_id=None, flow_id=None):
+        if isinstance(dsl, dict):
+            dsl = json.dumps(dsl, ensure_ascii=False)
        super().__init__(dsl, tenant_id, task_id)
+        if doc_id == CANVAS_DEBUG_DOC_ID:
+            doc_id = None
        self._doc_id = doc_id
        self._flow_id = flow_id
        self._kb_id = None
-        if doc_id:
+        if self._doc_id:
            self._kb_id = DocumentService.get_knowledgebase_id(doc_id)
-            assert self._kb_id, f"Can't find KB of this document: {doc_id}"
+            if not self._kb_id:
+                self._doc_id = None

    def callback(self, component_name: str, progress: float | int | None = None, message: str = "") -> None:
+        from rag.svr.task_executor import TaskCanceledException
        log_key = f"{self._flow_id}-{self.task_id}-logs"
+        timestamp = timer()
+        if has_canceled(self.task_id):
+            progress = -1
+            message += "[CANCEL]"
        try:
            bin = REDIS_CONN.get(log_key)
            obj = json.loads(bin.encode("utf-8"))
            if obj:
-                if obj[-1]["component_name"] == component_name:
-                    obj[-1]["trace"].append({"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")})
+                if obj[-1]["component_id"] == component_name:
+                    obj[-1]["trace"].append(
+                        {
+                            "progress": progress,
+                            "message": message,
+                            "datetime": datetime.datetime.now().strftime("%H:%M:%S"),
+                            "timestamp": timestamp,
+                            "elapsed_time": timestamp - obj[-1]["trace"][-1]["timestamp"],
+                        }
+                    )
                else:
-                    obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
+                    obj.append(
+                        {
+                            "component_id": component_name,
+                            "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
+                        }
+                    )
            else:
-                obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
-            REDIS_CONN.set_obj(log_key, obj, 60 * 10)
+                obj = [
+                    {
+                        "component_id": component_name,
+                        "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S"), "timestamp": timestamp, "elapsed_time": 0}],
+                    }
+                ]
+            if component_name != "END" and self._doc_id and self.task_id:
+                percentage = 1.0 / len(self.components.items())
+                finished = 0.0
+                for o in obj:
+                    for t in o["trace"]:
+                        if t["progress"] < 0:
+                            finished = -1
+                            break
+                    if finished < 0:
+                        break
+                    finished += o["trace"][-1]["progress"] * percentage
+
+                msg = ""
+                if len(obj[-1]["trace"]) == 1:
+                    msg += f"\n-------------------------------------\n[{self.get_component_name(o['component_id'])}]:\n"
+                t = obj[-1]["trace"][-1]
+                msg += "%s: %s\n" % (t["datetime"], t["message"])
+                TaskService.update_progress(self.task_id, {"progress": finished, "progress_msg": msg})
+            elif component_name == "END" and not self._doc_id:
+                obj[-1]["trace"][-1]["dsl"] = json.loads(str(self))
+            REDIS_CONN.set_obj(log_key, obj, 60 * 30)
+
        except Exception as e:
            logging.exception(e)

+        if has_canceled(self.task_id):
+            raise TaskCanceledException(message)
+
    def fetch_logs(self):
        log_key = f"{self._flow_id}-{self.task_id}-logs"
        try:
@ -62,34 +113,32 @@ class Pipeline(Graph):
            logging.exception(e)
        return []

-    def reset(self):
-        super().reset()
+
+    async def run(self, **kwargs):
        log_key = f"{self._flow_id}-{self.task_id}-logs"
        try:
            REDIS_CONN.set_obj(log_key, [], 60 * 10)
        except Exception as e:
            logging.exception(e)
-
-    async def run(self, **kwargs):
-        st = time.perf_counter()
+        self.error = ""
        if not self.path:
            self.path.append("File")
-
-        if self._doc_id:
-            DocumentService.update_by_id(
-                self._doc_id, {"progress": random.randint(0, 5) / 100.0, "progress_msg": "Start the pipeline...", "process_begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
-            )
-
-        self.error = ""
-        idx = len(self.path) - 1
-        if idx == 0:
            cpn_obj = self.get_component_obj(self.path[0])
            await cpn_obj.invoke(**kwargs)
            if cpn_obj.error():
                self.error = "[ERROR]" + cpn_obj.error()
-            else:
-                idx += 1
-                self.path.extend(cpn_obj.get_downstream())
+                self.callback(cpn_obj.component_name, -1, self.error)
+
+        if self._doc_id:
+            TaskService.update_progress(self.task_id, {
+                "progress": random.randint(0, 5) / 100.0,
+                "progress_msg": "Start the pipeline...",
+                "begin_at": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")})
+
+        idx = len(self.path) - 1
+        cpn_obj = self.get_component_obj(self.path[idx])
+        idx += 1
+        self.path.extend(cpn_obj.get_downstream())

        while idx < len(self.path) and not self.error:
            last_cpn = self.get_component_obj(self.path[idx - 1])
@ -98,15 +147,28 @@ class Pipeline(Graph):
            async def invoke():
                nonlocal last_cpn, cpn_obj
                await cpn_obj.invoke(**last_cpn.output())
+                #if inspect.iscoroutinefunction(cpn_obj.invoke):
+                #    await cpn_obj.invoke(**last_cpn.output())
+                #else:
+                #    cpn_obj.invoke(**last_cpn.output())

            async with trio.open_nursery() as nursery:
                nursery.start_soon(invoke)
+
            if cpn_obj.error():
                self.error = "[ERROR]" + cpn_obj.error()
-                self.callback(cpn_obj.component_name, -1, self.error)
+                self.callback(cpn_obj._id, -1, self.error)
                break
            idx += 1
            self.path.extend(cpn_obj.get_downstream())

-        if self._doc_id:
-            DocumentService.update_by_id(self._doc_id, {"progress": 1 if not self.error else -1, "progress_msg": "Pipeline finished...\n" + self.error, "process_duration": time.perf_counter() - st})
+        self.callback("END", 1 if not self.error else -1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))
+
+        if not self.error:
+            return self.get_component_obj(self.path[-1]).output()
+
+        TaskService.update_progress(self.task_id, {
+            "progress": -1,
+            "progress_msg": f"[ERROR]: {self.error}"})
+
+        return {}
--- a/rag/flow/splitter/init.py
+++ b/rag/flow/splitter/init.py
@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
--- a/rag/flow/splitter/schema.py
+++ b/rag/flow/splitter/schema.py
@ -17,19 +17,20 @@ from typing import Any, Literal
 from pydantic import BaseModel, ConfigDict, Field


-class ChunkerFromUpstream(BaseModel):
+class SplitterFromUpstream(BaseModel):
    created_time: float | None = Field(default=None, alias="_created_time")
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    blob: bytes
+    file: dict | None = Field(default=None)
+    chunks: list[dict[str, Any]] | None = Field(default=None)

    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)

    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
-    html_result: list[str] | None = Field(default=None, alias="html")
+    html_result: str | None = Field(default=None, alias="html")

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

--- a/rag/flow/splitter/splitter.py
+++ b/rag/flow/splitter/splitter.py
@ -0,0 +1,111 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import random
+from functools import partial
+
+import trio
+
+from api.utils import get_uuid
+from api.utils.base64_image import id2image, image2id
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.flow.splitter.schema import SplitterFromUpstream
+from rag.nlp import naive_merge, naive_merge_with_images
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+class SplitterParam(ProcessParamBase):
+    def __init__(self):
+        super().__init__()
+        self.chunk_token_size = 512
+        self.delimiters = ["\n"]
+        self.overlapped_percent = 0
+
+    def check(self):
+        self.check_empty(self.delimiters, "Delimiters.")
+        self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
+        self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
+
+    def get_input_form(self) -> dict[str, dict]:
+        return {}
+
+
+class Splitter(ProcessBase):
+    component_name = "Splitter"
+
+    async def _invoke(self, **kwargs):
+        try:
+            from_upstream = SplitterFromUpstream.model_validate(kwargs)
+        except Exception as e:
+            self.set_output("_ERROR", f"Input error: {str(e)}")
+            return
+
+        deli = ""
+        for d in self._param.delimiters:
+            if len(d) > 1:
+                deli += f"`{d}`"
+            else:
+                deli += d
+
+        self.set_output("output_format", "chunks")
+        self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
+        if from_upstream.output_format in ["markdown", "text", "html"]:
+            if from_upstream.output_format == "markdown":
+                payload = from_upstream.markdown_result
+            elif from_upstream.output_format == "text":
+                payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
+
+            if not payload:
+                payload = ""
+
+            cks = naive_merge(
+                payload,
+                self._param.chunk_token_size,
+                deli,
+                self._param.overlapped_percent,
+            )
+            self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
+
+            self.callback(1, "Done.")
+            return
+
+        # json
+        sections, section_images = [], []
+        for o in from_upstream.json_result or []:
+            sections.append((o.get("text", ""), o.get("position_tag", "")))
+            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
+
+        chunks, images = naive_merge_with_images(
+            sections,
+            section_images,
+            self._param.chunk_token_size,
+            deli,
+            self._param.overlapped_percent,
+        )
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
+                "image": img,
+                "positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
+            }
+            for c, img in zip(chunks, images) if c.strip()
+        ]
+        async with trio.open_nursery() as nursery:
+            for d in cks:
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())
+        self.set_output("chunks",  cks)
+        self.callback(1, "Done.")
--- a/rag/flow/tests/client.py
+++ b/rag/flow/tests/client.py
@ -30,7 +30,7 @@ def print_logs(pipeline: Pipeline):
    while True:
        time.sleep(5)
        logs = pipeline.fetch_logs()
-        logs_str = json.dumps(logs)
+        logs_str = json.dumps(logs, ensure_ascii=False)
        if logs_str != last_logs:
            print(logs_str)
        last_logs = logs_str
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -38,6 +38,13 @@
                  ],
                  "output_format": "json"
                },
+                "slides": {
+                    "parse_method": "presentation",
+                    "suffix": [
+                        "pptx"
+                    ],
+                    "output_format": "json"
+                },
                "markdown": {
                  "suffix": [
                    "md",
@ -82,19 +89,36 @@
                  "lang": "Chinese",
                  "llm_id": "SenseVoiceSmall",
                  "output_format": "json"
+                },
+                "email": {
+                  "suffix": [
+                    "msg"
+                  ],
+                  "fields": [
+                    "from",
+                    "to",
+                    "cc",
+                    "bcc",
+                    "date",
+                    "subject",
+                    "body",
+                    "attachments"
+                  ],
+                  "output_format": "json"
                }
              }
          }
        },
-        "downstream": ["Chunker:0"],
+        "downstream": ["Splitter:0"],
        "upstream": ["Begin"]
    },
-    "Chunker:0": {
+    "Splitter:0": {
        "obj": {
-            "component_name": "Chunker",
+            "component_name": "Splitter",
            "params": {
-              "method": "general",
-              "auto_keywords": 5
+              "chunk_token_size": 512,
+              "delimiters": ["\n"],
+              "overlapped_percent": 0
            }
        },
        "downstream": ["Tokenizer:0"],
--- a/rag/flow/tests/dsl_examples/hierarchical_merger.json
+++ b/rag/flow/tests/dsl_examples/hierarchical_merger.json
@ -0,0 +1,84 @@
+{
+  "components": {
+    "File": {
+        "obj":{
+            "component_name": "File",
+            "params": {
+            }
+        },
+        "downstream": ["Parser:0"],
+        "upstream": []
+    },
+    "Parser:0": {
+        "obj": {
+            "component_name": "Parser",
+            "params": {
+              "setups": {
+                "pdf": {
+                  "parse_method": "deepdoc",
+                  "vlm_name": "",
+                  "lang": "Chinese",
+                  "suffix": [
+                    "pdf"
+                  ],
+                  "output_format": "json"
+                },
+                "spreadsheet": {
+                  "suffix": [
+                    "xls",
+                    "xlsx",
+                    "csv"
+                  ],
+                  "output_format": "html"
+                },
+                "word": {
+                  "suffix": [
+                    "doc",
+                    "docx"
+                  ],
+                  "output_format": "json"
+                },
+                "markdown": {
+                  "suffix": [
+                    "md",
+                    "markdown"
+                  ],
+                  "output_format": "text"
+                },
+                "text": {
+                  "suffix": ["txt"],
+                  "output_format": "json"
+                }
+              }
+          }
+        },
+        "downstream": ["Splitter:0"],
+        "upstream": ["File"]
+    },
+    "Splitter:0": {
+        "obj": {
+            "component_name": "Splitter",
+            "params": {
+              "chunk_token_size": 512,
+              "delimiters": ["\r\n"],
+              "overlapped_percent": 0
+            }
+        },
+        "downstream": ["HierarchicalMerger:0"],
+        "upstream": ["Parser:0"]
+    },
+    "HierarchicalMerger:0": {
+        "obj": {
+            "component_name": "HierarchicalMerger",
+            "params": {
+              "levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]],
+              "hierarchy": 2
+            }
+        },
+        "downstream": [],
+        "upstream": ["Splitter:0"]
+    }
+  },
+  "path": []
+}
+
--- a/rag/flow/tokenizer/schema.py
+++ b/rag/flow/tokenizer/schema.py
@ -22,16 +22,16 @@ class TokenizerFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str = ""
-    blob: bytes
+    file: dict | None = Field(default=None)

-    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
+    output_format: Literal["json", "markdown", "text", "html", "chunks"] | None = Field(default=None)

    chunks: list[dict[str, Any]] | None = Field(default=None)

    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
-    html_result: list[str] | None = Field(default=None, alias="html")
+    html_result: str | None = Field(default=None, alias="html")

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

@ -40,12 +40,14 @@ class TokenizerFromUpstream(BaseModel):
        if self.chunks:
            return self

-        if self.output_format in {"markdown", "text"}:
+        if self.output_format in {"markdown", "text", "html"}:
            if self.output_format == "markdown" and not self.markdown_result:
                raise ValueError("output_format=markdown requires a markdown payload (field: 'markdown' or 'markdown_result').")
            if self.output_format == "text" and not self.text_result:
                raise ValueError("output_format=text requires a text payload (field: 'text' or 'text_result').")
+            if self.output_format == "html" and not self.html_result:
+                raise ValueError("output_format=html requires an html payload (field: 'html' or 'html_result').")
        else:
-            if not self.json_result:
+            if not self.json_result and not self.chunks:
                raise ValueError("When no chunks are provided and output_format is not markdown/text, a JSON list payload is required (field: 'json' or 'json_result').")
        return self
--- a/rag/flow/tokenizer/tokenizer.py
+++ b/rag/flow/tokenizer/tokenizer.py
@ -37,6 +37,7 @@ class TokenizerParam(ProcessParamBase):
        super().__init__()
        self.search_method = ["full_text", "embedding"]
        self.filename_embd_weight = 0.1
+        self.fields = ["text"]

    def check(self):
        for v in self.search_method:
@ -61,10 +62,14 @@ class Tokenizer(ProcessBase):
        embedding_model = LLMBundle(self._canvas._tenant_id, LLMType.EMBEDDING, llm_name=embedding_id)
        texts = []
        for c in chunks:
-            if c.get("questions"):
-                texts.append("\n".join(c["questions"]))
-            else:
-                texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c["text"]))
+            txt = ""
+            for f in self._param.fields:
+                f = c.get(f)
+                if isinstance(f, str):
+                    txt += f
+                elif isinstance(f, list):
+                    txt += "\n".join(f)
+            texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
        vts, c = embedding_model.encode([name])
        token_count += c
        tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)
@ -103,26 +108,36 @@ class Tokenizer(ProcessBase):
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

+        self.set_output("output_format", "chunks")
        parts = sum(["full_text" in self._param.search_method, "embedding" in self._param.search_method])
        if "full_text" in self._param.search_method:
            self.callback(random.randint(1, 5) / 100.0, "Start to tokenize.")
            if from_upstream.chunks:
                chunks = from_upstream.chunks
                for i, ck in enumerate(chunks):
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                    if ck.get("questions"):
-                        ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
+                        ck["question_kwd"] = ck["questions"].split("\n")
+                        ck["question_tks"] = rag_tokenizer.tokenize(str(ck["questions"]))
                    if ck.get("keywords"):
-                        ck["important_tks"] = rag_tokenizer.tokenize("\n".join(ck["keywords"]))
-                    ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
-                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
+                        ck["important_kwd"] = ck["keywords"].split(",")
+                        ck["important_tks"] = rag_tokenizer.tokenize(str(ck["keywords"]))
+                    if ck.get("summary"):
+                        ck["content_ltks"] = rag_tokenizer.tokenize(str(ck["summary"]))
+                        ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
+                    else:
+                        ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
+                        ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                    if i % 100 == 99:
                        self.callback(i * 1.0 / len(chunks) / parts)
+
            elif from_upstream.output_format in ["markdown", "text", "html"]:
                if from_upstream.output_format == "markdown":
                    payload = from_upstream.markdown_result
                elif from_upstream.output_format == "text":
                    payload = from_upstream.text_result
-                else:  # == "html"
+                else:
                    payload = from_upstream.html_result

                if not payload:
@ -130,12 +145,16 @@ class Tokenizer(ProcessBase):

                ck = {"text": payload}
                if "full_text" in self._param.search_method:
-                    ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
+                    ck["content_ltks"] = rag_tokenizer.tokenize(payload)
                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                chunks = [ck]
            else:
                chunks = from_upstream.json_result
                for i, ck in enumerate(chunks):
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", from_upstream.name))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                    ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
                    ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                    if i % 100 == 99: