Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve?

#9869

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
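
Reviewer aid: a minimal sketch of the per-file-type `setups` payload that the new Parser component validates, reconstructed from the diff below. Keys and allowed values are taken from the code; the concrete values are illustrative, not authoritative defaults.

```python
# Illustrative Parser "setups" configuration (a subset of the supported types).
setups = {
    "pdf": {
        "parse_method": "deepdoc",  # "deepdoc" | "plain_text" | a VLM model name
        "lang": "Chinese",
        "suffix": ["pdf"],
        "output_format": "json",
    },
    "text&markdown": {
        "suffix": ["md", "markdown", "mdx", "txt"],
        "output_format": "json",  # or "text"
    },
    "image": {
        "parse_method": "ocr",  # "ocr" | a VLM model name
        "lang": "Chinese",
        "system_prompt": "",
        "suffix": ["jpg", "jpeg", "png", "gif"],
        "output_format": "text",
    },
    "email": {
        "suffix": ["eml", "msg"],
        "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
        "output_format": "json",  # or "text"
    },
}
```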

---------

Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: jinhai <haijin.chn@gmail.com>
Signed-off-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: chanx <1243304602@qq.com>
Co-authored-by: balibabu <cike8899@users.noreply.github.com>
Co-authored-by: Lynn <lynn_inf@hotmail.com>
Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com>
Co-authored-by: huangzl <huangzl@shinemo.com>
Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com>
Co-authored-by: Wilmer <33392318@qq.com>
Co-authored-by: Adrian Weidig <adrianweidig@gmx.net>
Co-authored-by: Zhichang Yu <yuzhichang@gmail.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Yongteng Lei <yongtengrey@outlook.com>
Co-authored-by: Liu An <asiro@qq.com>
Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com>
Co-authored-by: BadwomanCraZY <511528396@qq.com>
Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com>
Co-authored-by: Russell Valentine <russ@coldstonelabs.org>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Billy Bao <newyorkupperbay@gmail.com>
Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com>
Co-authored-by: TensorNull <tensor.null@gmail.com>
Co-authored-by: TeslaZY <TeslaZY@outlook.com>
Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com>
Co-authored-by: AB <aj@Ajays-MacBook-Air.local>
Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com>
Co-authored-by: He Wang <wanghechn@qq.com>
Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com>
Co-authored-by: Jin Hai <haijin.chn@gmail.com>
Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com>
Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box>
Co-authored-by: Stephen Hu <stephenhu@seismic.com>
Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com>
Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com>
Co-authored-by: mxc <mxc@example.com>
Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com>
Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com>
Co-authored-by: mcoder6425 <mcoder64@gmail.com>
Co-authored-by: lemsn <lemsn@msn.com>
Co-authored-by: lemsn <lemsn@126.com>
Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com>
Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com>
Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
Kevin Hu authored on 2025-10-09 12:36:19 +08:00, committed by GitHub
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions


@@ -13,20 +13,28 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import json
import os
import random
from functools import partial
import trio
import numpy as np
from PIL import Image
from api.db import LLMType
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from api.utils import get_uuid
from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
from rag.llm.cv_model import Base as VLM
from rag.utils.storage_factory import STORAGE_IMPL
class ParserParam(ProcessParamBase):
@@ -45,12 +53,14 @@ class ParserParam(ProcessParamBase):
"word": [
"json",
],
"ppt": [],
"slides": [
"json",
],
"image": [
"text"
],
"email": [],
"text": [
"email": ["text", "json"],
"text&markdown": [
"text",
"json"
],
@@ -63,7 +73,6 @@
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"llm_id": "",
"lang": "Chinese",
"suffix": [
"pdf",
@@ -85,23 +94,29 @@
],
"output_format": "json",
},
"markdown": {
"suffix": ["md", "markdown"],
"text&markdown": {
"suffix": ["md", "markdown", "mdx", "txt"],
"output_format": "json",
},
"slides": {
"suffix": [
"pptx",
],
"output_format": "json",
},
"ppt": {},
"image": {
"parse_method": ["ocr", "vlm"],
"parse_method": "ocr",
"llm_id": "",
"lang": "Chinese",
"system_prompt": "",
"suffix": ["jpg", "jpeg", "png", "gif"],
"output_format": "json",
"output_format": "text",
},
"email": {},
"text": {
"email": {
"suffix": [
"txt"
"eml", "msg"
],
"fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
"output_format": "json",
},
"audio": {
@@ -131,13 +146,10 @@ class ParserParam(ProcessParamBase):
pdf_config = self.setups.get("pdf", {})
if pdf_config:
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_valid_value(pdf_parse_method.lower(), "Parse method abnormal.", ["deepdoc", "plain_text", "vlm"])
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("llm_id"), "VLM")
pdf_language = pdf_config.get("lang", "")
self.check_empty(pdf_language, "Language")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])
@@ -147,32 +159,38 @@ class ParserParam(ProcessParamBase):
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
doc_config = self.setups.get("doc", "")
doc_config = self.setups.get("word", "")
if doc_config:
doc_output_format = doc_config.get("output_format", "")
self.check_valid_value(doc_output_format, "Word processor document output format abnormal.", self.allowed_output_format["doc"])
self.check_valid_value(doc_output_format, "Word processor document output format abnormal.", self.allowed_output_format["word"])
slides_config = self.setups.get("slides", "")
if slides_config:
slides_output_format = slides_config.get("output_format", "")
self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
image_config = self.setups.get("image", "")
if image_config:
image_parse_method = image_config.get("parse_method", "")
self.check_valid_value(image_parse_method.lower(), "Parse method abnormal.", ["ocr", "vlm"])
if image_parse_method not in ["ocr"]:
self.check_empty(image_config.get("llm_id"), "VLM")
self.check_empty(image_config.get("lang", ""), "Image VLM language")
image_language = image_config.get("lang", "")
self.check_empty(image_language, "Language")
text_config = self.setups.get("text", "")
text_config = self.setups.get("text&markdown", "")
if text_config:
text_output_format = text_config.get("output_format", "")
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text"])
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
audio_config = self.setups.get("audio", "")
if audio_config:
self.check_empty(audio_config.get("llm_id"), "VLM")
self.check_empty(audio_config.get("llm_id"), "Audio VLM")
audio_language = audio_config.get("lang", "")
self.check_empty(audio_language, "Language")
email_config = self.setups.get("email", "")
if email_config:
email_output_format = email_config.get("output_format", "")
self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"])
def get_input_form(self) -> dict[str, dict]:
return {}
@@ -180,21 +198,18 @@ class ParserParam(ProcessParamBase):
class Parser(ProcessBase):
component_name = "Parser"
def _pdf(self, from_upstream: ParserFromUpstream):
def _pdf(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
blob = from_upstream.blob
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method") == "deepdoc":
if conf.get("parse_method").lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method") == "plain_text":
elif conf.get("parse_method").lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
else:
assert conf.get("llm_id")
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("llm_id"), lang=self._param.setups["pdf"].get("lang"))
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
bboxes = []
for t, poss in lines:
@@ -214,66 +229,63 @@ class Parser(ProcessBase):
mkdn += b.get("text", "") + "\n"
self.set_output("markdown", mkdn)
def _spreadsheet(self, from_upstream: ParserFromUpstream):
def _spreadsheet(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
blob = from_upstream.blob
conf = self._param.setups["spreadsheet"]
self.set_output("output_format", conf["output_format"])
print("spreadsheet {conf=}", flush=True)
spreadsheet_parser = ExcelParser()
if conf.get("output_format") == "html":
html = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", html)
htmls = spreadsheet_parser.html(blob, 1000000000)
self.set_output("html", htmls[0])
elif conf.get("output_format") == "json":
self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
elif conf.get("output_format") == "markdown":
self.set_output("markdown", spreadsheet_parser.markdown(blob))
def _word(self, from_upstream: ParserFromUpstream):
from tika import parser as word_parser
def _word(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["word"]
self.set_output("output_format", conf["output_format"])
print("word {conf=}", flush=True)
doc_parsed = word_parser.from_buffer(blob)
sections = []
if doc_parsed.get("content"):
sections = doc_parsed["content"].split("\n")
sections = [{"text": section} for section in sections if section]
else:
logging.warning(f"tika.parser got empty content from {name}.")
docx_parser = Docx()
sections, tbls = docx_parser(name, binary=blob)
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
sections.extend([{"text": tb, "image": None} for ((_,tb), _) in tbls])
# json
assert conf.get("output_format") == "json", "output_format has to be json for doc"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, from_upstream: ParserFromUpstream):
def _slides(self, name, blob):
from deepdoc.parser.ppt_parser import RAGFlowPptParser
self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document.")
conf = self._param.setups["slides"]
self.set_output("output_format", conf["output_format"])
ppt_parser = RAGFlowPptParser()
txts = ppt_parser(blob, 0, 100000, None)
sections = [{"text": section} for section in txts if section.strip()]
# json
assert conf.get("output_format") == "json", "output_format has to be json for ppt"
if conf.get("output_format") == "json":
self.set_output("json", sections)
def _markdown(self, name, blob):
from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["markdown"]
conf = self._param.setups["text&markdown"]
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
sections, tables = markdown_parser(name, blob, separate_tables=False)
# json
assert conf.get("output_format") == "json", "output_format has to be json for markdown"
if conf.get("output_format") == "json":
json_results = []
@@ -291,69 +303,51 @@ class Parser(ProcessBase):
json_results.append(json_result)
self.set_output("json", json_results)
def _text(self, from_upstream: ParserFromUpstream):
from deepdoc.parser.utils import get_text
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["text"]
self.set_output("output_format", conf["output_format"])
# parse binary to text
text_content = get_text(name, binary=blob)
if conf.get("output_format") == "json":
result = [{"text": text_content}]
self.set_output("json", result)
else:
result = text_content
self.set_output("text", result)
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
def _image(self, from_upstream: ParserFromUpstream):
def _image(self, name, blob):
from deepdoc.vision import OCR
self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
blob = from_upstream.blob
conf = self._param.setups["image"]
self.set_output("output_format", conf["output_format"])
img = Image.open(io.BytesIO(blob)).convert("RGB")
lang = conf["lang"]
if conf["parse_method"] == "ocr":
# use ocr, recognize chars only
ocr = OCR()
bxs = ocr(np.array(img)) # return boxes and recognize result
txt = "\n".join([t[0] for _, t in bxs if t[0]])
else:
lang = conf["lang"]
# use VLM to describe the picture
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["parse_method"], lang=lang)
img_binary = io.BytesIO()
img.save(img_binary, format="JPEG")
img_binary.seek(0)
txt = cv_model.describe(img_binary.read())
system_prompt = conf.get("system_prompt")
if system_prompt:
txt = cv_model.describe_with_prompt(img_binary.read(), system_prompt)
else:
txt = cv_model.describe(img_binary.read())
self.set_output("text", txt)
def _audio(self, from_upstream: ParserFromUpstream):
def _audio(self, name, blob):
import os
import tempfile
self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")
blob = from_upstream.blob
name = from_upstream.name
conf = self._param.setups["audio"]
self.set_output("output_format", conf["output_format"])
lang = conf["lang"]
_, ext = os.path.splitext(name)
tmp_path = ""
with tempfile.NamedTemporaryFile(suffix=ext) as tmpf:
tmpf.write(blob)
tmpf.flush()
@@ -364,15 +358,131 @@ class Parser(ProcessBase):
self.set_output("text", txt)
def _email(self, name, blob):
self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
email_content = {}
conf = self._param.setups["email"]
target_fields = conf["fields"]
_, ext = os.path.splitext(name)
if ext == ".eml":
# handle eml file
from email import policy
from email.parser import BytesParser
msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
email_content['metadata'] = {}
# handle header info
for header, value in msg.items():
# get fields like from, to, cc, bcc, date, subject
if header.lower() in target_fields:
email_content[header.lower()] = value
# get metadata
elif header.lower() not in ["from", "to", "cc", "bcc", "date", "subject"]:
email_content["metadata"][header.lower()] = value
# get body
if "body" in target_fields:
body_text, body_html = [], []
def _add_content(m, content_type):
if content_type == "text/plain":
body_text.append(
m.get_payload(decode=True).decode(m.get_content_charset() or "utf-8")
)
elif content_type == "text/html":
body_html.append(
m.get_payload(decode=True).decode(m.get_content_charset() or "utf-8")
)
elif "multipart" in content_type:
if m.is_multipart():
for part in m.iter_parts():
_add_content(part, part.get_content_type())
_add_content(msg, msg.get_content_type())
email_content["text"] = body_text
email_content["text_html"] = body_html
# get attachment
if "attachments" in target_fields:
attachments = []
for part in msg.iter_attachments():
content_disposition = part.get("Content-Disposition")
if content_disposition:
dispositions = content_disposition.strip().split(";")
if dispositions[0].lower() == "attachment":
filename = part.get_filename()
payload = part.get_payload(decode=True)
attachments.append({
"filename": filename,
"payload": payload,
})
email_content["attachments"] = attachments
else:
# handle msg file
import extract_msg
print("handle a msg file.")
msg = extract_msg.Message(blob)
# handle header info
basic_content = {
"from": msg.sender,
"to": msg.to,
"cc": msg.cc,
"bcc": msg.bcc,
"date": msg.date,
"subject": msg.subject,
}
email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
# get metadata
email_content['metadata'] = {
'message_id': msg.messageId,
'in_reply_to': msg.inReplyTo,
}
# get body
if "body" in target_fields:
email_content["text"] = msg.body # usually empty. try text_html instead
email_content["text_html"] = msg.htmlBody
# get attachments
if "attachments" in target_fields:
attachments = []
for t in msg.attachments:
attachments.append({
"filename": t.name,
"payload": t.data # binary
})
email_content["attachments"] = attachments
if conf["output_format"] == "json":
self.set_output("json", [email_content])
else:
content_txt = ''
for k, v in email_content.items():
if isinstance(v, str):
# basic info
content_txt += f'{k}:{v}' + "\n"
elif isinstance(v, dict):
# metadata
content_txt += f'{k}:{json.dumps(v)}' + "\n"
elif isinstance(v, list):
# attachments or others
for fb in v:
if isinstance(fb, dict):
# attachments
content_txt += f'{fb["filename"]}:{fb["payload"]}' + "\n"
else:
# str, usually plain text
content_txt += fb
self.set_output("text", content_txt)
async def _invoke(self, **kwargs):
function_map = {
"pdf": self._pdf,
"markdown": self._markdown,
"text&markdown": self._markdown,
"spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word,
"text": self._text,
"image": self._image,
"audio": self._audio,
"email": self._email,
}
try:
from_upstream = ParserFromUpstream.model_validate(kwargs)
@@ -380,8 +490,25 @@ class Parser(ProcessBase):
self.set_output("_ERROR", f"Input error: {str(e)}")
return
name = from_upstream.name
if self._canvas._doc_id:
b, n = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
blob = STORAGE_IMPL.get(b, n)
else:
blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])
done = False
for p_type, conf in self._param.setups.items():
if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
continue
await trio.to_thread.run_sync(function_map[p_type], from_upstream)
await trio.to_thread.run_sync(function_map[p_type], name, blob)
done = True
break
if not done:
raise Exception("No suitable for file extension: `.%s`" % from_upstream.name.split(".")[-1].lower())
outs = self.output()
async with trio.open_nursery() as nursery:
for d in outs.get("json", []):
nursery.start_soon(image2id, d, partial(STORAGE_IMPL.put), get_uuid())

rag/flow/parser/schema.py

@@ -20,6 +20,5 @@ class ParserFromUpstream(BaseModel):
elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
name: str
blob: bytes
file: dict | None = Field(default=None)
model_config = ConfigDict(populate_by_name=True, extra="forbid")
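
For context, a hedged sketch of the revised `ParserFromUpstream` model and how it is now populated (assumes pydantic v2, which the `ConfigDict`/`model_validate` usage implies; the `file` dict keys `id` and `created_by` mirror the `FileService.get_blob` call in the parser diff above):

```python
from pydantic import BaseModel, ConfigDict, Field

class ParserFromUpstream(BaseModel):
    elapsed_time: float | None = Field(default=None, alias="_elapsed_time")
    name: str
    # `blob: bytes` is gone; the Parser now resolves bytes from storage itself.
    file: dict | None = Field(default=None)

    model_config = ConfigDict(populate_by_name=True, extra="forbid")

# Upstream components pass a file reference instead of raw bytes:
upstream = ParserFromUpstream.model_validate({
    "name": "report.pdf",
    "file": {"id": "<file-uuid>", "created_by": "<user-uuid>"},  # illustrative ids
})
```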