Feat: add splitter (#10161)

### What problem does this PR solve? ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com>
2025-12-20 04:39:00 +08:00 · 2025-09-19 10:15:19 +08:00
parent f9c7404bee
commit a1b947ffd6
81 changed files with 3083 additions and 799 deletions
--- a/rag/flow/base.py
+++ b/rag/flow/base.py
@ -13,7 +13,6 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 #
-import logging
 import os
 import time
 from functools import partial
@ -44,17 +43,17 @@ class ProcessBase(ComponentBase):
        self.set_output("_created_time", time.perf_counter())
        for k, v in kwargs.items():
            self.set_output(k, v)
-        try:
-            with trio.fail_after(self._param.timeout):
-                await self._invoke(**kwargs)
-                self.callback(1, "Done")
-        except Exception as e:
-            if self.get_exception_default_value():
-                self.set_exception_default_value()
-            else:
-                self.set_output("_ERROR", str(e))
-            logging.exception(e)
-            self.callback(-1, str(e))
+        #try:
+        with trio.fail_after(self._param.timeout):
+            await self._invoke(**kwargs)
+            self.callback(1, "Done")
+        #except Exception as e:
+        #    if self.get_exception_default_value():
+        #        self.set_exception_default_value()
+        #    else:
+        #        self.set_output("_ERROR", str(e))
+        #    logging.exception(e)
+        #    self.callback(-1, str(e))
        self.set_output("_elapsed_time", time.perf_counter() - self.output("_created_time"))
        return self.output()

--- a/rag/flow/chunker/chunker.py
+++ b/rag/flow/chunker/chunker.py
@ -12,18 +12,19 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import json
 import random
-
 import trio
-
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser.pdf_parser import RAGFlowPdfParser
 from graphrag.utils import chat_limiter, get_llm_cache, set_llm_cache
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.chunker.schema import ChunkerFromUpstream
-from rag.nlp import naive_merge, naive_merge_with_images
-from rag.prompts.prompts import keyword_extraction, question_proposal
+from rag.nlp import naive_merge, naive_merge_with_images, concat_img
+from rag.prompts.prompts import keyword_extraction, question_proposal, detect_table_of_contents, \
+    table_of_contents_index, toc_transformer
+from rag.utils import num_tokens_from_string


 class ChunkerParam(ProcessParamBase):
@ -43,6 +44,7 @@ class ChunkerParam(ProcessParamBase):
            "paper",
            "laws",
            "presentation",
+            "toc" # table of contents
            # Other
            # "Tag" # TODO: Other method
        ]
@ -54,7 +56,7 @@ class ChunkerParam(ProcessParamBase):
        self.auto_keywords = 0
        self.auto_questions = 0
        self.tag_sets = []
-        self.llm_setting = {"llm_name": "", "lang": "Chinese"}
+        self.llm_setting = {"llm_id": "", "lang": "Chinese"}

    def check(self):
        self.check_valid_value(self.method.lower(), "Chunk method abnormal.", self.method_options)
@ -142,6 +144,91 @@ class Chunker(ProcessBase):
    def _one(self, from_upstream: ChunkerFromUpstream):
        pass

+    def _toc(self, from_upstream: ChunkerFromUpstream):
+        """Chunk a JSON-formatted document along its table of contents (ToC).
+
+        The upstream JSON sections are grouped into windows of roughly 1024
+        tokens, a chat model is asked to locate and structure the ToC, and each
+        ToC entry is linked to the section indices it covers. Every linked
+        entry then becomes one chunk.
+
+        Returns a list of ``{"text", "image", "positions"}`` dicts when a ToC
+        was detected. Returns ``None`` when the upstream format is not JSON or
+        when no ToC was found.
+        """
+        self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `ToC`.")
+        if from_upstream.output_format in ["markdown", "text", "html"]:
+            return
+
+        # json
+        sections, section_images, page_1024, tc_arr = [], [], [""], [0]
+        for o in from_upstream.json_result or []:
+            txt = o.get("text", "")
+            page_1024[-1] += "\n" + txt
+            tc_arr[-1] += num_tokens_from_string(txt)
+            # Open a new window once the current one exceeds 1024 tokens.
+            if tc_arr[-1] > 1024:
+                page_1024.append("")
+                tc_arr.append(0)
+            sections.append((txt, o.get("position_tag", "")))
+            section_images.append(o.get("image"))
+
+        llm_setting = self._param.llm_setting
+        chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])
+        self.callback(random.randint(5, 15) / 100.0, "Start to detect table of contents...")
+        toc_secs = detect_table_of_contents(page_1024, chat_mdl)
+        if toc_secs:
+            self.callback(random.randint(25, 35) / 100.0, "Start to extract table of contents...")
+            toc_arr = toc_transformer(toc_secs, chat_mdl)
+            toc_arr = [it for it in toc_arr if it.get("structure")]
+            self.callback(random.randint(35, 75) / 100.0, "Start to link table of contents...")
+            toc_arr = table_of_contents_index(toc_arr, [t for t, _ in sections], chat_mdl)
+
+            # Give each linked entry the sections lying between its own last
+            # index and the first index of the next linked entry. The range is
+            # empty when the two entries are already adjacent.
+            for i, cur in enumerate(toc_arr[:-1]):
+                if not cur.get("indices"):
+                    continue
+                for nxt in toc_arr[i + 1:]:
+                    if nxt.get("indices"):
+                        cur["indices"].extend(range(cur["indices"][-1] + 1, nxt["indices"][0]))
+                        break
+
+            # NOTE: sections that precede the first linked entry are not
+            # attached to any entry by this method.
+
+            # Put all sections after the last linked entry into it.
+            for it in reversed(toc_arr):
+                if it.get("indices") and it["indices"][-1]:
+                    it["indices"] = list(range(it["indices"][0], len(sections)))
+                    break
+
+            chunks, images = [], []
+            for it in toc_arr:
+                if not it.get("indices"):
+                    continue
+                txt = ""
+                img = None
+                for idx in it["indices"]:
+                    txt += "\n" + sections[idx][0] + "\t" + sections[idx][1]
+                    if img and section_images[idx]:
+                        img = concat_img(img, section_images[idx])
+                    elif section_images[idx]:
+                        img = section_images[idx]
+
+                # From here on the entry points at its chunk, not at sections.
+                it["indices"] = [len(chunks)]
+                chunks.append(txt)
+                images.append(img)
+            self.callback(1, "Done")
+            return [
+                {
+                    "text": RAGFlowPdfParser.remove_tag(c),
+                    "image": img,
+                    "positions": RAGFlowPdfParser.extract_positions(c),
+                }
+                for c, img in zip(chunks, images)
+            ]
+
+        self.callback(message="No table of contents detected.")
+
+
    async def _invoke(self, **kwargs):
        function_map = {
            "general": self._general,
@ -154,6 +241,7 @@ class Chunker(ProcessBase):
            "laws": self._laws,
            "presentation": self._presentation,
            "one": self._one,
+            "toc": self._toc,
        }

        try:
@ -167,7 +255,7 @@ class Chunker(ProcessBase):

        async def auto_keywords():
            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
+            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])

            async def doc_keyword_extraction(chat_mdl, ck, topn):
                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "keywords", {"topn": topn})
@ -184,7 +272,7 @@ class Chunker(ProcessBase):

        async def auto_questions():
            nonlocal chunks, llm_setting
-            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_name"], lang=llm_setting["lang"])
+            chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT, llm_name=llm_setting["llm_id"], lang=llm_setting["lang"])

            async def doc_question_proposal(chat_mdl, d, topn):
                cached = get_llm_cache(chat_mdl.llm_name, ck["text"], "question", {"topn": topn})
--- a/rag/flow/chunker/schema.py
+++ b/rag/flow/chunker/schema.py
@ -22,7 +22,7 @@ class ChunkerFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    blob: bytes
+    file: dict | None = Field(default=None)

    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)

--- a/rag/flow/file.py
+++ b/rag/flow/file.py
@ -14,10 +14,7 @@
 #  limitations under the License.
 #
 from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
 from rag.flow.base import ProcessBase, ProcessParamBase
-from rag.utils.storage_factory import STORAGE_IMPL


 class FileParam(ProcessParamBase):
@ -41,10 +38,13 @@ class File(ProcessBase):
                self.set_output("_ERROR", f"Document({self._canvas._doc_id}) not found!")
                return

-            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
-            self.set_output("blob", STORAGE_IMPL.get(b, n))
+            #b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
+            #self.set_output("blob", STORAGE_IMPL.get(b, n))
            self.set_output("name", doc.name)
        else:
            file = kwargs.get("file")
            self.set_output("name", file["name"])
-            self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
+            self.set_output("file", file)
+            #self.set_output("blob", FileService.get_blob(file["created_by"], file["id"]))
+
+        self.callback(1, "File fetched.")
--- a/rag/flow/hierarchical_merger/init.py
+++ b/rag/flow/hierarchical_merger/init.py
@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
--- a/rag/flow/hierarchical_merger/hierarchical_merger.py
+++ b/rag/flow/hierarchical_merger/hierarchical_merger.py
@ -0,0 +1,178 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import json
+import random
+import re
+from copy import deepcopy
+from functools import partial
+
+import trio
+
+from api.utils import get_uuid
+from api.utils.base64_image import id2image, image2id
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.flow.hierarchical_merger.schema import HierarchicalMergerFromUpstream
+from rag.nlp import concat_img
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+class HierarchicalMergerParam(ProcessParamBase):
+    """Configuration of the hierarchical merger component."""
+
+    def __init__(self):
+        super().__init__()
+        # One collection of regular expressions per heading level; the merger
+        # tries the levels in order and uses the first one that matches a line.
+        self.levels = []
+        # Tree depth at which the merger cuts the document into chunks.
+        self.hierarchy = None
+
+    def check(self):
+        """Reject a configuration in which either setting is empty."""
+        required = (
+            (self.levels, "Hierarchical setups."),
+            (self.hierarchy, "Hierarchy number."),
+        )
+        for value, label in required:
+            self.check_empty(value, label)
+
+    def get_input_form(self) -> dict[str, dict]:
+        """This component exposes no input form."""
+        return {}
+
+
+class HierarchicalMerger(ProcessBase):
+    """Pipeline component that regroups upstream lines into chunks along a heading hierarchy.
+
+    Each line gets the index of the first entry of ``self._param.levels`` that
+    has a regular expression matching it; a line matching nothing is body
+    text. Headings and body text are arranged into a tree, and the tree is cut
+    into chunks at depth ``self._param.hierarchy``.
+    """
+    component_name = "HierarchicalMerger"
+
+    async def _invoke(self, **kwargs):
+        """Validate the upstream payload, build the heading tree and publish ``chunks``.
+
+        When the payload does not validate, the ``_ERROR`` output is set and
+        the method returns without producing chunks.
+        """
+        try:
+            from_upstream = HierarchicalMergerFromUpstream.model_validate(kwargs)
+        except Exception as e:
+            self.set_output("_ERROR", f"Input error: {str(e)}")
+            return
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to merge hierarchically.")
+        plain_output = from_upstream.output_format in ["markdown", "text", "html"]
+        section_images = []
+        if plain_output:
+            if from_upstream.output_format == "markdown":
+                payload = from_upstream.markdown_result
+            elif from_upstream.output_format == "text":
+                payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
+
+            if not payload:
+                payload = ""
+            elif isinstance(payload, list):
+                # The schema declares the HTML result as a list of strings.
+                payload = "\n".join(payload)
+
+            lines = [ln for ln in payload.split("\n") if ln]
+        else:
+            # The JSON payload is optional in the schema; treat None as empty.
+            json_result = from_upstream.json_result or []
+            lines = [o.get("text", "") for o in json_result]
+            section_images = [o.get("img_id") for o in json_result]
+            # NOTE(review): "position_tag" is not appended to the text here, so
+            # extract_positions() below presumably finds nothing - confirm.
+
+        # Classify every line: heading level index, or body_level for plain text.
+        levels = self._param.levels
+        body_level = len(levels)
+        matches = []
+        for txt in lines:
+            for lvl, regs in enumerate(levels):
+                if any(re.search(reg, txt) for reg in regs):
+                    matches.append(lvl)
+                    break
+            else:
+                matches.append(body_level)
+
+        root = {"level": -1, "index": -1, "texts": [], "children": []}
+        for i, m in enumerate(matches):
+            node = root
+            if m == body_level:
+                # Body text belongs to the most recently opened, deepest heading.
+                while node["children"]:
+                    node = node["children"][-1]
+                node["texts"].append(i)
+                continue
+
+            # Follow the last branch down until reaching the direct parent
+            # level of this heading, or a node without children.
+            while node["children"] and m != node["level"] + 1:
+                node = node["children"][-1]
+            node["children"].append({"level": m, "index": i, "texts": [], "children": []})
+
+        hierarchy = int(self._param.hierarchy)
+        all_pathes = []
+
+        def collect(node, path, depth):
+            """Gather the line indices of each chunk; ``path`` already covers ``node``."""
+            if depth >= hierarchy:
+                # Everything below the cut depth is folded into one chunk.
+                for child in node["children"]:
+                    path.extend([child["index"], *child["texts"]])
+                    collect(child, path, depth + 1)
+                if depth == hierarchy:
+                    all_pathes.append(path)
+                return
+
+            if not node["children"]:
+                # A branch that ends above the cut depth still forms a chunk,
+                # otherwise its lines would be lost.
+                if path:
+                    all_pathes.append(path)
+                return
+
+            for child in node["children"]:
+                # Each branch gets its own list so siblings never share lines.
+                collect(child, path + [child["index"], *child["texts"]], depth + 1)
+
+        collect(root, [], 0)
+
+        if plain_output:
+            cks = ["".join(lines[i] + "\n" for i in path) for path in all_pathes]
+            self.set_output("chunks", [{"text": c} for c in cks if c])
+        else:
+            cks = []
+            for path in all_pathes:
+                txt = ""
+                img = None
+                for i in path:
+                    txt += lines[i] + "\n"
+                    piece = id2image(section_images[i], partial(STORAGE_IMPL.get))
+                    if img and piece:
+                        img = concat_img(img, piece)
+                    elif piece:
+                        img = piece
+                if not txt:
+                    continue
+                cks.append(
+                    {
+                        "text": RAGFlowPdfParser.remove_tag(txt),
+                        "image": img,
+                        "positions": RAGFlowPdfParser.extract_positions(txt),
+                    }
+                )
+
+            async with trio.open_nursery() as nursery:
+                for d in cks:
+                    nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
+            # Publish only after the nursery has finished, i.e. once every
+            # image2id task has run on its chunk dict.
+            self.set_output("chunks", cks)
+
+        self.callback(1, "Done.")
--- a/rag/flow/hierarchical_merger/schema.py
+++ b/rag/flow/hierarchical_merger/schema.py
@ -0,0 +1,37 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class HierarchicalMergerFromUpstream(BaseModel):
+    """Schema of the keyword arguments validated by ``HierarchicalMerger._invoke``.
+
+    Fields may be supplied under their alias or their attribute name
+    (``populate_by_name=True``); any key not declared here fails validation
+    (``extra="forbid"``).
+    """
+    # Timing values, supplied under the underscore-prefixed aliases.
+    created_time: float | None = Field(default=None, alias="_created_time")
+    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
+
+    # The only required field.
+    name: str
+    file: dict | None = Field(default=None)
+    chunks: list[dict[str, Any]] | None = Field(default=None)
+
+    # output_format names which of the four *_result fields carries the payload.
+    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
+    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
+    markdown_result: str | None = Field(default=None, alias="markdown")
+    text_result: str | None = Field(default=None, alias="text")
+    # Unlike the markdown and text results, the HTML result is a list of strings.
+    html_result: list[str] | None = Field(default=None, alias="html")
+
+    model_config = ConfigDict(populate_by_name=True, extra="forbid")
+
+    # def to_dict(self, *, exclude_none: bool = True) -> dict:
+    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -12,18 +12,27 @@
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
+import io
 import logging
 import random
+from functools import partial

 import trio
+import numpy as np
+from PIL import Image

 from api.db import LLMType
+from api.db.services.file2document_service import File2DocumentService
+from api.db.services.file_service import FileService
 from api.db.services.llm_service import LLMBundle
+from api.utils import get_uuid
+from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
 from rag.llm.cv_model import Base as VLM
+from rag.utils.storage_factory import STORAGE_IMPL


 class ParserParam(ProcessParamBase):
@ -43,17 +52,24 @@ class ParserParam(ProcessParamBase):
                "json",
            ],
            "ppt": [],
-            "image": [],
+            "image": [
+                "text"
+            ],
            "email": [],
-            "text": [],
-            "audio": [],
+            "text": [
+                "text",
+                "json"
+            ],
+            "audio": [
+                "json"
+            ],
            "video": [],
        }

        self.setups = {
            "pdf": {
                "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
-                "vlm_name": "",
+                "llm_id": "",
                "lang": "Chinese",
                "suffix": [
                    "pdf",
@ -76,16 +92,46 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "markdown": {
-                "suffix": ["md", "markdown"],
+                "suffix": ["md", "markdown", "mdx"],
                "output_format": "json",
            },
            "ppt": {},
            "image": {
                "parse_method": "ocr",
+                "llm_id": "",
+                "lang": "Chinese",
+                "suffix": ["jpg", "jpeg", "png", "gif"],
+                "output_format": "json",
+            },
+            "email": {
+                "fields": []
+            },
+            "text": {
+                "suffix": [
+                    "txt"
+                ],
+                "output_format": "json",
+            },
+            "audio": {
+                "suffix":[
+                    "da",
+                    "wave",
+                    "wav",
+                    "mp3",
+                    "aac",
+                    "flac",
+                    "ogg",
+                    "aiff",
+                    "au",
+                    "midi",
+                    "wma",
+                    "realaudio",
+                    "vqf",
+                    "oggvorbis",
+                    "ape"
+                ],
+                "output_format": "json",
            },
-            "email": {},
-            "text": {},
-            "audio": {},
            "video": {},
        }

@ -96,7 +142,7 @@ class ParserParam(ProcessParamBase):
            self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])

            if pdf_parse_method not in ["deepdoc", "plain_text"]:
-                self.check_empty(pdf_config.get("vlm_name"), "VLM")
+                self.check_empty(pdf_config.get("llm_id"), "VLM")

            pdf_language = pdf_config.get("lang", "")
            self.check_empty(pdf_language, "Language")
@ -117,7 +163,23 @@ class ParserParam(ProcessParamBase):
        image_config = self.setups.get("image", "")
        if image_config:
            image_parse_method = image_config.get("parse_method", "")
-            self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr"])
+            self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
+            if image_parse_method not in ["ocr"]:
+                self.check_empty(image_config.get("llm_id"), "VLM")
+
+            image_language = image_config.get("lang", "")
+            self.check_empty(image_language, "Language")
+
+        text_config = self.setups.get("text", "")
+        if text_config:
+            text_output_format = text_config.get("output_format", "")
+            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
+
+        audio_config = self.setups.get("audio", "")
+        if audio_config:
+            self.check_empty(audio_config.get("llm_id"), "VLM")
+            audio_language = audio_config.get("lang", "")
+            self.check_empty(audio_language, "Language")

    def get_input_form(self) -> dict[str, dict]:
        return {}
@ -126,10 +188,8 @@ class ParserParam(ProcessParamBase):
 class Parser(ProcessBase):
    component_name = "Parser"

-    def _pdf(self, from_upstream: ParserFromUpstream):
+    def _pdf(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
-
-        blob = from_upstream.blob
        conf = self._param.setups["pdf"]
        self.set_output("output_format", conf["output_format"])

@ -139,8 +199,8 @@ class Parser(ProcessBase):
            lines, _ = PlainParser()(blob)
            bboxes = [{"text": t} for t, _ in lines]
        else:
-            assert conf.get("vlm_name")
-            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("vlm_name"), lang=self._param.setups["pdf"].get("lang"))
+            assert conf.get("llm_id")
+            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
            lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
            bboxes = []
            for t, poss in lines:
@ -149,6 +209,7 @@ class Parser(ProcessBase):

        if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
+
        if conf.get("output_format") == "markdown":
            mkdn = ""
            for b in bboxes:
@ -160,14 +221,10 @@ class Parser(ProcessBase):
                mkdn += b.get("text", "") + "\n"
            self.set_output("markdown", mkdn)

-    def _spreadsheet(self, from_upstream: ParserFromUpstream):
+    def _spreadsheet(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
-
-        blob = from_upstream.blob
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
-
-        print("spreadsheet {conf=}", flush=True)
        spreadsheet_parser = ExcelParser()
        if conf.get("output_format") == "html":
            html = spreadsheet_parser.html(blob, 1000000000)
@ -177,19 +234,13 @@ class Parser(ProcessBase):
        elif conf.get("output_format") == "markdown":
            self.set_output("markdown", spreadsheet_parser.markdown(blob))

-    def _word(self, from_upstream: ParserFromUpstream):
+    def _word(self, name, blob):
        from tika import parser as  word_parser

        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
-
-        blob = from_upstream.blob
-        name = from_upstream.name
        conf = self._param.setups["word"]
        self.set_output("output_format", conf["output_format"])
-
-        print("word {conf=}", flush=True)
        doc_parsed = word_parser.from_buffer(blob)
-
        sections = []
        if doc_parsed.get("content"):
            sections = doc_parsed["content"].split("\n")
@ -202,26 +253,18 @@ class Parser(ProcessBase):
        if conf.get("output_format") == "json":
            self.set_output("json", sections)

-    def _markdown(self, from_upstream: ParserFromUpstream):
+    def _markdown(self, name, blob):
        from functools import reduce
-
        from rag.app.naive import Markdown as naive_markdown_parser
        from rag.nlp import concat_img

-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
-
-        blob = from_upstream.blob
-        name = from_upstream.name
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
        conf = self._param.setups["markdown"]
        self.set_output("output_format", conf["output_format"])

-        print("markdown {conf=}", flush=True)
-
        markdown_parser = naive_markdown_parser()
        sections, tables = markdown_parser(name, blob, separate_tables=False)

-        # json
-        assert conf.get("output_format") == "json", "have to be json for doc"
        if conf.get("output_format") == "json":
            json_results = []

@ -239,14 +282,86 @@ class Parser(ProcessBase):
                json_results.append(json_result)

            self.set_output("json", json_results)
+        else:
+            self.set_output("text", "\n".join([section_text for section_text, _ in sections]))

+    def _text(self, name, blob):
+        """Decode a plain-text file and publish it in the configured output format.
+
+        ``name`` is the file name handed to ``get_text`` and ``blob`` is the
+        raw file content. The result goes to the ``json`` output as a single
+        ``{"text": ...}`` item when the configured format is ``"json"``, and
+        to the ``text`` output otherwise.
+        """
+        from deepdoc.parser.utils import get_text
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
+        output_format = self._param.setups["text"]["output_format"]
+        self.set_output("output_format", output_format)
+
+        # parse binary to text
+        decoded = get_text(name, binary=blob)
+
+        if output_format == "json":
+            self.set_output("json", [{"text": decoded}])
+            return
+        self.set_output("text", decoded)
+
+    def _image(self, name, blob):
+        """Convert an image into text, by OCR or by a vision-language model.
+
+        Takes ``(name, blob)`` like the other parser handlers, because
+        ``_invoke`` dispatches every handler with the file name and the raw
+        file content. ``name`` is not used here.
+
+        With ``parse_method == "ocr"`` the recognised strings are joined with
+        newlines; any other parse method asks the configured ``llm_id`` model
+        to describe a JPEG re-encoding of the picture. The result is written
+        to the ``text`` output.
+        """
+        from deepdoc.vision import OCR
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
+
+        conf = self._param.setups["image"]
+        # NOTE(review): the default image setup declares output_format "json",
+        # yet only the "text" output is written below - confirm which of the
+        # two downstream components rely on.
+        self.set_output("output_format", conf["output_format"])
+
+        img = Image.open(io.BytesIO(blob)).convert("RGB")
+        lang = conf["lang"]
+
+        if conf["parse_method"] == "ocr":
+            # use ocr, recognize chars only
+            ocr = OCR()
+            bxs = ocr(np.array(img))  # return boxes and recognize result
+            txt = "\n".join([t[0] for _, t in bxs if t[0]])
+
+        else:
+            # use VLM to describe the picture
+            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
+            img_binary = io.BytesIO()
+            img.save(img_binary, format="JPEG")
+            img_binary.seek(0)
+            txt = cv_model.describe(img_binary.read())
+
+        self.set_output("text", txt)
+
+    def _audio(self, name, blob):
+        """Transcribe an audio file with the tenant's speech-to-text model.
+
+        Takes ``(name, blob)`` like the other parser handlers, because
+        ``_invoke`` dispatches every handler with the file name and the raw
+        file content. The content is written to a temporary file that keeps
+        the extension of ``name``, and the transcription is written to the
+        ``text`` output.
+        """
+        import os
+        import tempfile
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
+
+        conf = self._param.setups["audio"]
+        # NOTE(review): the audio setup declares output_format "json", yet only
+        # the "text" output is written below - confirm which one is expected.
+        self.set_output("output_format", conf["output_format"])
+
+        lang = conf["lang"]
+        _, ext = os.path.splitext(name)
+        # The transcription must happen inside the ``with`` block: the
+        # temporary file is removed as soon as the block exits.
+        with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
+            tmpf.write(blob)
+            tmpf.flush()
+            tmp_path = os.path.abspath(tmpf.name)
+
+            seq2txt_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, lang=lang)
+            txt = seq2txt_mdl.transcription(tmp_path)
+
+            self.set_output("text", txt)

    async def _invoke(self, **kwargs):
        function_map = {
            "pdf": self._pdf,
            "markdown": self._markdown,
            "spreadsheet": self._spreadsheet,
-            "word": self._word
+            "word": self._word,
+            "text": self._text,
+            "image": self._image,
+            "audio": self._audio,
        }
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
@ -254,8 +369,20 @@ class Parser(ProcessBase):
            self.set_output("_ERROR", f"Input error: {str(e)}")
            return

+        name = from_upstream.name
+        if self._canvas._doc_id:
+            b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
+            blob = STORAGE_IMPL.get(b, n)
+        else:
+            blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
+
        for p_type, conf in self._param.setups.items():
            if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
                continue
-            await trio.to_thread.run_sync(function_map[p_type], from_upstream)
+            await trio.to_thread.run_sync(function_map[p_type], name, blob)
            break
+
+        outs = self.output()
+        async with trio.open_nursery() as nursery:
+            for d in outs.get("json", []):
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
--- a/rag/flow/parser/schema.py
+++ b/rag/flow/parser/schema.py
@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str
-    blob: bytes
-
+    file: dict | None = Field(default=None)
    model_config = ConfigDict(populate_by_name=True, extra="forbid")
--- a/rag/flow/pipeline.py
+++ b/rag/flow/pipeline.py
@ -48,7 +48,24 @@ class Pipeline(Graph):
                    obj.append({"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]})
            else:
                obj = [{"component_name": component_name, "trace": [{"progress": progress, "message": message, "datetime": datetime.datetime.now().strftime("%H:%M:%S")}]}]
-            REDIS_CONN.set_obj(log_key, obj, 60 * 10)
+            REDIS_CONN.set_obj(log_key, obj, 60 * 30)
+            if self._doc_id:
+                percentage = 1./len(self.components.items())
+                msg = ""
+                finished = 0.
+                for o in obj:
+                    if o['component_name'] == "END":
+                        continue
+                    msg += f"\n[{o['component_name']}]:\n"
+                    for t in o["trace"]:
+                        msg += "%s: %s\n"%(t["datetime"], t["message"])
+                        if t["progress"] < 0:
+                            finished = -1
+                            break
+                    if finished < 0:
+                        break
+                    finished += o["trace"][-1]["progress"] * percentage
+                DocumentService.update_by_id(self._doc_id, {"progress": finished, "progress_msg": msg})
        except Exception as e:
            logging.exception(e)

@ -108,5 +125,11 @@ class Pipeline(Graph):
            idx += 1
            self.path.extend(cpn_obj.get_downstream())

+        self.callback("END", 1, json.dumps(self.get_component_obj(self.path[-1]).output(), ensure_ascii=False))
+
        if self._doc_id:
-            DocumentService.update_by_id(self._doc_id, {"progress": 1 if not self.error else -1, "progress_msg": "Pipeline finished...\n" + self.error, "process_duration": time.perf_counter() - st})
+            DocumentService.update_by_id(self._doc_id,{
+                "progress": 1 if not self.error else -1,
+                "progress_msg": "Pipeline finished...\n" + self.error,
+                "process_duration": time.perf_counter() - st
+            })
--- a/rag/flow/splitter/init.py
+++ b/rag/flow/splitter/init.py
@ -0,0 +1,15 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
--- a/rag/flow/splitter/schema.py
+++ b/rag/flow/splitter/schema.py
@ -0,0 +1,38 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+from typing import Any, Literal
+
+from pydantic import BaseModel, ConfigDict, Field
+
+
+class SplitterFromUpstream(BaseModel):
+    """Validated input of the Splitter component, built from the upstream outputs.
+
+    Fields may be supplied either by attribute name or by alias
+    (``populate_by_name=True``); any key that matches no field is rejected
+    (``extra="forbid"``).
+    """
+
+    # Timing values, received under the underscore-prefixed aliases.
+    created_time: float | None = Field(default=None, alias="_created_time")
+    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
+
+    # Required file name; the remaining fields are optional.
+    name: str
+    file: dict | None = Field(default=None)
+    chunks: list[dict[str, Any]] | None = Field(default=None)
+
+    # Names which of the *_result fields below carries the payload.
+    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)
+
+    # One payload slot per output format, received under the bare format name.
+    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
+    markdown_result: str | None = Field(default=None, alias="markdown")
+    text_result: str | None = Field(default=None, alias="text")
+    html_result: list[str] | None = Field(default=None, alias="html")
+
+    model_config = ConfigDict(populate_by_name=True, extra="forbid")
+
+    # def to_dict(self, *, exclude_none: bool = True) -> dict:
+    #     return self.model_dump(by_alias=True, exclude_none=exclude_none)
--- a/rag/flow/splitter/splitter.py
+++ b/rag/flow/splitter/splitter.py
@ -0,0 +1,112 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+import json
+import random
+from functools import partial
+
+import trio
+
+from api.utils import get_uuid
+from api.utils.base64_image import id2image, image2id
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+from rag.flow.base import ProcessBase, ProcessParamBase
+from rag.flow.splitter.schema import SplitterFromUpstream
+from rag.nlp import naive_merge, naive_merge_with_images
+from rag.utils.storage_factory import STORAGE_IMPL
+
+
+class SplitterParam(ProcessParamBase):
+    """Configuration of the Splitter component."""
+
+    def __init__(self):
+        """Set the defaults: 512-token chunks, newline delimiter, no overlap."""
+        super().__init__()
+        # Token budget handed to the merge helpers for each chunk.
+        self.chunk_token_size = 512
+        # Delimiter strings; Splitter wraps entries longer than one character in backticks.
+        self.delimiters = ["\n"]
+        # Overlap between chunks; the validation label below states the range [0, 1).
+        self.overlapped_percent = 0
+
+    def check(self):
+        """Validate the three parameters with the inherited ``check_*`` helpers."""
+        # NOTE(review): the helpers presumably raise on invalid values - confirm in ProcessParamBase.
+        self.check_empty(self.delimiters, "Delimiters.")
+        self.check_positive_integer(self.chunk_token_size, "Chunk token size.")
+        self.check_decimal_float(self.overlapped_percent, "Overlapped percentage: [0, 1)")
+
+    def get_input_form(self) -> dict[str, dict]:
+        """Return the input form description; this component declares none."""
+        return {}
+
+
+class Splitter(ProcessBase):
+    """Pipeline component that merges upstream parser output into chunks.
+
+    Chunk size, delimiters and overlap come from the component's
+    ``SplitterParam``; the result is published as the ``chunks`` output.
+    """
+
+    component_name = "Splitter"
+
+    async def _invoke(self, **kwargs):
+        """Split the upstream payload and store the result in the ``chunks`` output.
+
+        ``kwargs`` is validated against ``SplitterFromUpstream``. On a
+        validation failure the message is stored in the ``_ERROR`` output and
+        nothing else is produced.
+
+        * ``markdown`` / ``text`` / ``html``: the payload (or ``""`` when it is
+          empty or missing) goes through ``naive_merge`` and every chunk
+          becomes ``{"text": chunk}``.
+        * Any other ``output_format`` (including ``None``) is handled as
+          ``json``: each item's ``text`` / ``position_tag`` / ``img_id`` is fed
+          to ``naive_merge_with_images`` and every chunk becomes a dict with
+          ``text``, ``image`` and ``positions``.
+        """
+        try:
+            from_upstream = SplitterFromUpstream.model_validate(kwargs)
+        except Exception as e:
+            self.set_output("_ERROR", f"Input error: {str(e)}")
+            return
+
+        # Multi-character delimiters are wrapped in backticks, single characters
+        # are appended as-is.
+        # NOTE(review): presumably the backtick form is how the merge helpers
+        # tell a whole-string delimiter from single characters - confirm.
+        deli = "".join(f"`{d}`" if len(d) > 1 else d for d in self._param.delimiters)
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
+
+        plain_payloads = {
+            "markdown": from_upstream.markdown_result,
+            "text": from_upstream.text_result,
+            "html": from_upstream.html_result,
+        }
+        if from_upstream.output_format in plain_payloads:
+            # A missing or empty payload is merged as an empty string.
+            payload = plain_payloads[from_upstream.output_format] or ""
+
+            cks = naive_merge(
+                payload,
+                self._param.chunk_token_size,
+                deli,
+                self._param.overlapped_percent,
+            )
+            self.set_output("chunks", [{"text": c} for c in cks])
+
+            self.callback(1, "Done.")
+            return
+
+        # json
+        sections, section_images = [], []
+        for o in from_upstream.json_result or []:
+            sections.append((o.get("text", ""), o.get("position_tag", "")))
+            section_images.append(id2image(o.get("img_id"), partial(STORAGE_IMPL.get)))
+
+        chunks, images = naive_merge_with_images(
+            sections,
+            section_images,
+            self._param.chunk_token_size,
+            deli,
+            self._param.overlapped_percent,
+        )
+        cks = [
+            {
+                "text": RAGFlowPdfParser.remove_tag(c),
+                "image": img,
+                "positions": RAGFlowPdfParser.extract_positions(c),
+            }
+            for c, img in zip(chunks, images)
+        ]
+        # One task per chunk; the nursery block only exits once all have finished.
+        # NOTE(review): image2id presumably stores each chunk's image via
+        # STORAGE_IMPL.put and rewrites the dict in place - confirm in
+        # api.utils.base64_image.
+        async with trio.open_nursery() as nursery:
+            for d in cks:
+                nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), "_image_temps", get_uuid())
+        self.set_output("chunks", cks)
+        self.callback(1, "Done.")
--- a/rag/flow/tests/dsl_examples/general_pdf_all.json
+++ b/rag/flow/tests/dsl_examples/general_pdf_all.json
@ -44,20 +44,58 @@
                    "markdown"
                  ],
                  "output_format": "json"
+                },
+                "text": {
+                  "suffix": ["txt"],
+                  "output_format": "json"
+                },
+                "image": {
+                  "parse_method": "vlm",
+                  "llm_id":"glm-4.5v",
+                  "lang": "Chinese",
+                  "suffix": [
+                    "jpg",
+                    "jpeg",
+                    "png",
+                    "gif"
+                  ],
+                  "output_format": "text"
+                },
+                "audio": {
+                  "suffix": [
+                    "da",
+                    "wave",
+                    "wav",
+                    "mp3",
+                    "aac",
+                    "flac",
+                    "ogg",
+                    "aiff",
+                    "au",
+                    "midi",
+                    "wma",
+                    "realaudio",
+                    "vqf",
+                    "oggvorbis",
+                    "ape"
+                  ],
+                  "lang": "Chinese",
+                  "llm_id": "SenseVoiceSmall",
+                  "output_format": "json"
                }
              }
-            }
          }
        },
-        "downstream": ["Chunker:0"],
+        "downstream": ["Splitter:0"],
        "upstream": ["Begin"]
    },
-    "Chunker:0": {
+    "Splitter:0": {
        "obj": {
-            "component_name": "Chunker",
+            "component_name": "Splitter",
            "params": {
-              "method": "general",
-              "auto_keywords": 5
+              "chunk_token_size": 512,
+              "delimiters": ["\n"],
+              "overlapped_percent": 0
            }
        },
        "downstream": ["Tokenizer:0"],
--- a/rag/flow/tests/dsl_examples/hierarchical_merger.json
+++ b/rag/flow/tests/dsl_examples/hierarchical_merger.json
@ -0,0 +1,84 @@
+{
+  "components": {
+    "File": {
+        "obj":{
+            "component_name": "File",
+            "params": {
+            }
+        },
+        "downstream": ["Parser:0"],
+        "upstream": []
+    },
+    "Parser:0": {
+        "obj": {
+            "component_name": "Parser",
+            "params": {
+              "setups": {
+                "pdf": {
+                  "parse_method": "deepdoc",
+                  "vlm_name": "",
+                  "lang": "Chinese",
+                  "suffix": [
+                    "pdf"
+                  ],
+                  "output_format": "json"
+                },
+                "spreadsheet": {
+                  "suffix": [
+                    "xls",
+                    "xlsx",
+                    "csv"
+                  ],
+                  "output_format": "html"
+                },
+                "word": {
+                  "suffix": [
+                    "doc",
+                    "docx"
+                  ],
+                  "output_format": "json"
+                },
+                "markdown": {
+                  "suffix": [
+                    "md",
+                    "markdown"
+                  ],
+                  "output_format": "text"
+                },
+                "text": {
+                  "suffix": ["txt"],
+                  "output_format": "json"
+                }
+              }
+          }
+        },
+        "downstream": ["Splitter:0"],
+        "upstream": ["File"]
+    },
+    "Splitter:0": {
+        "obj": {
+            "component_name": "Splitter",
+            "params": {
+              "chunk_token_size": 512,
+              "delimiters": ["\r\n"],
+              "overlapped_percent": 0
+            }
+        },
+        "downstream": ["HierarchicalMerger:0"],
+        "upstream": ["Parser:0"]
+    },
+    "HierarchicalMerger:0": {
+        "obj": {
+            "component_name": "HierarchicalMerger",
+            "params": {
+              "levels": [["^#[^#]"], ["^##[^#]"], ["^###[^#]"], ["^####[^#]"]],
+              "hierarchy": 2
+            }
+        },
+        "downstream": [],
+        "upstream": ["Splitter:0"]
+    }
+  },
+  "path": []
+}
+
--- a/rag/flow/tokenizer/schema.py
+++ b/rag/flow/tokenizer/schema.py
@ -22,7 +22,7 @@ class TokenizerFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")

    name: str = ""
-    blob: bytes
+    file: dict | None = Field(default=None)

    output_format: Literal["json", "markdown", "text", "html"] | None = Field(default=None)

--- a/rag/flow/tokenizer/tokenizer.py
+++ b/rag/flow/tokenizer/tokenizer.py
@ -37,6 +37,7 @@ class TokenizerParam(ProcessParamBase):
        super().__init__()
        self.search_method = ["full_text", "embedding"]
        self.filename_embd_weight = 0.1
+        self.fields = ["text"]

    def check(self):
        for v in self.search_method:
@ -61,10 +62,14 @@ class Tokenizer(ProcessBase):
        embedding_model = LLMBundle(self._canvas._tenant_id, LLMType.EMBEDDING, llm_name=embedding_id)
        texts = []
        for c in chunks:
-            if c.get("questions"):
-                texts.append("\n".join(c["questions"]))
-            else:
-                texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c["text"]))
+            txt = ""
+            for f in self._param.fields:
+                f = c.get(f)
+                if isinstance(f, str):
+                    txt += f
+                elif isinstance(f, list):
+                    txt += "\n".join(f)
+            texts.append(re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt))
        vts, c = embedding_model.encode([name])
        token_count += c
        tts = np.concatenate([vts[0] for _ in range(len(texts))], axis=0)