Feat: dataflow supports Spreadsheet and Word processor documents (#9996)

### What problem does this PR solve?

Dataflow now supports Spreadsheet and Word processor documents.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Yongteng Lei
2025-09-10 13:02:53 +08:00
committed by GitHub
parent e650f0d368
commit 0d9c1f1c3c
9 changed files with 126 additions and 43 deletions

View File

@@ -73,11 +73,13 @@ class Chunker(ProcessBase):
     def _general(self, from_upstream: ChunkerFromUpstream):
         self.callback(random.randint(1, 5) / 100.0, "Start to chunk via `General`.")
-        if from_upstream.output_format in ["markdown", "text"]:
+        if from_upstream.output_format in ["markdown", "text", "html"]:
             if from_upstream.output_format == "markdown":
                 payload = from_upstream.markdown_result
-            else:  # == "text"
+            elif from_upstream.output_format == "text":
                 payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
             if not payload:
                 payload = ""
@@ -90,6 +92,7 @@ class Chunker(ProcessBase):
             )
             return [{"text": c} for c in cks]

+        # json
         sections, section_images = [], []
         for o in from_upstream.json_result or []:
             sections.append((o.get("text", ""), o.get("position_tag", "")))
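
For `html`, `payload` is a `list[str]` rather than a single string, which is why `naive_merge` (touched at the bottom of this PR) widens its `sections` parameter to `str | list`. A minimal sketch of the two call shapes it now has to accept; the import path and chunk parameters are assumptions, not code from this PR:

```python
from rag.nlp import naive_merge  # import path assumed from this repo's layout

# markdown/text upstream: payload is one string
cks = naive_merge("## Title\nsome text ...", chunk_token_num=128)

# html upstream: payload is a list[str], one HTML fragment per table slice
cks = naive_merge(["<table>...</table>", "<table>...</table>"], chunk_token_num=128)
```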

View File

@@ -29,7 +29,7 @@ class ChunkerFromUpstream(BaseModel):
     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
     text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
+    html_result: list[str] | None = Field(default=None, alias="html")

     model_config = ConfigDict(populate_by_name=True, extra="forbid")
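
Because the model sets `populate_by_name=True`, callers can use either the alias or the field name. A self-contained sketch, trimmed to the fields visible in this hunk:

```python
from typing import Any

from pydantic import BaseModel, ConfigDict, Field


class ChunkerFromUpstream(BaseModel):
    # Trimmed to the fields shown in this diff; the real model has more.
    json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
    markdown_result: str | None = Field(default=None, alias="markdown")
    text_result: str | None = Field(default=None, alias="text")
    html_result: list[str] | None = Field(default=None, alias="html")

    model_config = ConfigDict(populate_by_name=True, extra="forbid")


# The alias and the field name are both accepted.
a = ChunkerFromUpstream.model_validate({"html": ["<table>...</table>"]})
b = ChunkerFromUpstream(html_result=["<table>...</table>"])
assert a.html_result == b.html_result
```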

View File

@@ -12,6 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 import random

 import trio
@@ -29,8 +30,18 @@ class ParserParam(ProcessParamBase):
     def __init__(self):
         super().__init__()
         self.allowed_output_format = {
-            "pdf": ["json", "markdown"],
-            "excel": ["json", "markdown", "html"],
+            "pdf": [
+                "json",
+                "markdown",
+            ],
+            "spreadsheet": [
+                "json",
+                "markdown",
+                "html",
+            ],
+            "word": [
+                "json",
+            ],
             "ppt": [],
             "image": [],
             "email": [],
@@ -44,12 +55,25 @@ class ParserParam(ProcessParamBase):
                 "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
                 "vlm_name": "",
                 "lang": "Chinese",
-                "suffix": ["pdf"],
+                "suffix": [
+                    "pdf",
+                ],
                 "output_format": "json",
             },
-            "excel": {
+            "spreadsheet": {
                 "output_format": "html",
-                "suffix": ["xls", "xlsx", "csv"],
+                "suffix": [
+                    "xls",
+                    "xlsx",
+                    "csv",
+                ],
             },
+            "word": {
+                "suffix": [
+                    "doc",
+                    "docx",
+                ],
+                "output_format": "json",
+            },
             "ppt": {},
             "image": {
@@ -76,10 +100,15 @@ class ParserParam(ProcessParamBase):
         pdf_output_format = pdf_config.get("output_format", "")
         self.check_valid_value(pdf_output_format, "PDF output format abnormal.", self.allowed_output_format["pdf"])

-        excel_config = self.setups.get("excel", "")
-        if excel_config:
-            excel_output_format = excel_config.get("output_format", "")
-            self.check_valid_value(excel_output_format, "Excel output format abnormal.", self.allowed_output_format["excel"])
+        spreadsheet_config = self.setups.get("spreadsheet", "")
+        if spreadsheet_config:
+            spreadsheet_output_format = spreadsheet_config.get("output_format", "")
+            self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
+
+        word_config = self.setups.get("word", "")
+        if word_config:
+            word_output_format = word_config.get("output_format", "")
+            self.check_valid_value(word_output_format, "Word processor document output format abnormal.", self.allowed_output_format["word"])

         image_config = self.setups.get("image", "")
         if image_config:
@@ -93,10 +122,13 @@
 class Parser(ProcessBase):
     component_name = "Parser"

-    def _pdf(self, blob):
+    def _pdf(self, from_upstream: ParserFromUpstream):
         self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
+        blob = from_upstream.blob
         conf = self._param.setups["pdf"]
         self.set_output("output_format", conf["output_format"])
+
         if conf.get("parse_method") == "deepdoc":
             bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
         elif conf.get("parse_method") == "plain_text":
@@ -110,6 +142,7 @@
         for t, poss in lines:
             pn, x0, x1, top, bott = poss.split(" ")
             bboxes.append({"page_number": int(pn), "x0": float(x0), "x1": float(x1), "top": float(top), "bottom": float(bott), "text": t})
+
         if conf.get("output_format") == "json":
             self.set_output("json", bboxes)
         if conf.get("output_format") == "markdown":
@@ -123,23 +156,53 @@
                 mkdn += b.get("text", "") + "\n"
             self.set_output("markdown", mkdn)

-    def _excel(self, blob):
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Excel.")
-        conf = self._param.setups["excel"]
+    def _spreadsheet(self, from_upstream: ParserFromUpstream):
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
+        blob = from_upstream.blob
+        conf = self._param.setups["spreadsheet"]
         self.set_output("output_format", conf["output_format"])
-        excel_parser = ExcelParser()
+        print(f"spreadsheet {conf=}", flush=True)
+        spreadsheet_parser = ExcelParser()
         if conf.get("output_format") == "html":
-            html = excel_parser.html(blob, 1000000000)
+            html = spreadsheet_parser.html(blob, 1000000000)
             self.set_output("html", html)
         elif conf.get("output_format") == "json":
-            self.set_output("json", [{"text": txt} for txt in excel_parser(blob) if txt])
+            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
         elif conf.get("output_format") == "markdown":
-            self.set_output("markdown", excel_parser.markdown(blob))
+            self.set_output("markdown", spreadsheet_parser.markdown(blob))
+
+    def _word(self, from_upstream: ParserFromUpstream):
+        from tika import parser as word_parser
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word processor document.")
+        blob = from_upstream.blob
+        name = from_upstream.name
+        conf = self._param.setups["word"]
+        self.set_output("output_format", conf["output_format"])
+        print(f"word {conf=}", flush=True)
+
+        doc_parsed = word_parser.from_buffer(blob)
+        sections = []
+        if doc_parsed.get("content"):
+            sections = doc_parsed["content"].split("\n")
+            sections = [{"text": section} for section in sections if section]
+        else:
+            logging.warning(f"tika.parser got empty content from {name}.")
+
+        # json
+        assert conf.get("output_format") == "json", "Word processor document output format must be json."
+        if conf.get("output_format") == "json":
+            self.set_output("json", sections)

     async def _invoke(self, **kwargs):
         function_map = {
             "pdf": self._pdf,
-            "excel": self._excel,
+            "spreadsheet": self._spreadsheet,
+            "word": self._word,
         }
         try:
             from_upstream = ParserFromUpstream.model_validate(kwargs)
@@ -150,5 +213,5 @@ class Parser(ProcessBase):
         for p_type, conf in self._param.setups.items():
             if from_upstream.name.split(".")[-1].lower() not in conf.get("suffix", []):
                 continue
-            await trio.to_thread.run_sync(function_map[p_type], from_upstream.blob)
+            await trio.to_thread.run_sync(function_map[p_type], from_upstream)
             break
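
`_word` delegates extraction to Apache Tika via the `tika` package, which needs a Java runtime and fetches its server JAR on first use. A standalone sketch of the same path, with a hypothetical file name:

```python
import logging

from tika import parser  # pip install tika; requires a Java runtime


def word_to_sections(blob: bytes, name: str) -> list[dict[str, str]]:
    """Mirror _word's extraction: one {"text": ...} dict per non-empty line."""
    doc_parsed = parser.from_buffer(blob)
    if not doc_parsed.get("content"):
        logging.warning(f"tika.parser got empty content from {name}.")
        return []
    return [{"text": line} for line in doc_parsed["content"].split("\n") if line]


with open("example.docx", "rb") as f:  # hypothetical input file
    sections = word_to_sections(f.read(), "example.docx")
```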

View File

@@ -23,13 +23,20 @@
             ],
             "output_format": "json"
         },
-        "excel": {
-            "output_format": "html",
+        "spreadsheet": {
             "suffix": [
                 "xls",
                 "xlsx",
                 "csv"
-            ]
-        }
+            ],
+            "output_format": "html"
+        },
+        "word": {
+            "suffix": [
+                "doc",
+                "docx"
+            ],
+            "output_format": "json"
+        }
     }
 }
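
Given this config, `Parser._invoke` routes a file to its handler purely by extension. A simplified sketch of that dispatch, with the setups dict abbreviated to the suffix lists above:

```python
setups = {
    "pdf": {"suffix": ["pdf"], "output_format": "json"},
    "spreadsheet": {"suffix": ["xls", "xlsx", "csv"], "output_format": "html"},
    "word": {"suffix": ["doc", "docx"], "output_format": "json"},
}


def route(filename: str) -> str | None:
    """Return the parser type whose suffix list matches the file extension."""
    ext = filename.split(".")[-1].lower()
    for p_type, conf in setups.items():
        if ext in conf.get("suffix", []):
            return p_type
    return None


assert route("report.DOCX") == "word"
assert route("table.csv") == "spreadsheet"
```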

View File

@@ -31,7 +31,7 @@ class TokenizerFromUpstream(BaseModel):
     json_result: list[dict[str, Any]] | None = Field(default=None, alias="json")
     markdown_result: str | None = Field(default=None, alias="markdown")
     text_result: str | None = Field(default=None, alias="text")
-    html_result: str | None = Field(default=None, alias="html")
+    html_result: list[str] | None = Field(default=None, alias="html")

     model_config = ConfigDict(populate_by_name=True, extra="forbid")

View File

@@ -117,11 +117,13 @@ class Tokenizer(ProcessBase):
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                 if i % 100 == 99:
                     self.callback(i * 1.0 / len(chunks) / parts)
-        elif from_upstream.output_format in ["markdown", "text"]:
+        elif from_upstream.output_format in ["markdown", "text", "html"]:
             if from_upstream.output_format == "markdown":
                 payload = from_upstream.markdown_result
-            else:  # == "text"
+            elif from_upstream.output_format == "text":
                 payload = from_upstream.text_result
+            else:  # == "html"
+                payload = from_upstream.html_result
             if not payload:
                 return ""

View File

@@ -751,6 +751,8 @@ class SILICONFLOWEmbed(Base):
         token_count = 0
         for i in range(0, len(texts), batch_size):
             texts_batch = texts[i : i + batch_size]
+            texts_batch = [" " if not text.strip() else text for text in texts_batch]
+
             payload = {
                 "model": self.model_name,
                 "input": texts_batch,
@@ -935,7 +937,7 @@ class GiteeEmbed(SILICONFLOWEmbed):
         if not base_url:
             base_url = "https://ai.gitee.com/v1/embeddings"
         super().__init__(key, model_name, base_url)


 class DeepInfraEmbed(OpenAIEmbed):
     _FACTORY_NAME = "DeepInfra"
@@ -951,4 +953,4 @@ class Ai302Embed(Base):
     def __init__(self, key, model_name, base_url="https://api.302.ai/v1/embeddings"):
         if not base_url:
             base_url = "https://api.302.ai/v1/embeddings"
-        super().__init__(key, model_name, base_url)
\ No newline at end of file
+        super().__init__(key, model_name, base_url)
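
The new list comprehension in `SILICONFLOWEmbed` replaces blank batch entries with a single space, presumably because the embedding endpoint rejects empty input strings (my reading; the PR does not state it). The guard in isolation:

```python
texts_batch = ["hello", "", "   ", "world"]
# Swap whitespace-only entries for a single space so the embedding API
# never receives an empty input string (assumed API constraint).
texts_batch = [" " if not text.strip() else text for text in texts_batch]
assert texts_batch == ["hello", " ", " ", "world"]
```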

View File

@@ -518,7 +518,7 @@ def hierarchical_merge(bull, sections, depth):
     return res


-def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
+def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
     from deepdoc.parser.pdf_parser import RAGFlowPdfParser

     if not sections:
         return []
@@ -534,7 +534,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。", overl
             pos = ""
         if tnum < 8:
             pos = ""
-        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
         if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
             if cks:
                 overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
@@ -638,10 +638,10 @@ def concat_img(img1, img2):
         return img2
     if not img1 and not img2:
         return None
+    if img1 is img2:
+        return img1

     if isinstance(img1, Image.Image) and isinstance(img2, Image.Image):
         pixel_data1 = img1.tobytes()
         pixel_data2 = img2.tobytes()
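
The added identity check in `concat_img` returns early when both arguments are the same object, skipping the byte-level comparison below it, which copies full pixel buffers. A rough sketch of the guard order; the leading branches are reconstructed from the visible context, not copied from the file:

```python
from PIL import Image


def concat_img(img1: Image.Image | None, img2: Image.Image | None):
    # Leading guards reconstructed from the context lines above.
    if img1 and not img2:
        return img1
    if not img1 and img2:
        return img2
    if not img1 and not img2:
        return None
    if img1 is img2:  # new fast path: same object, nothing to concatenate
        return img1
    # ...the full version compares pixel bytes and stitches the images...
    return img1
```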