Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)

### What problem does this PR solve? - Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet parsing forms - Implemented support for setting table result type (Markdown/HTML) and Markdown image response type (URL/Text) - Updated TCADP Parser to handle return format settings from configuration or parameters - Enhanced frontend to dynamically show TCADP options based on selected parsing method - Modified backend to pass format parameters when calling TCADP API - Optimized form default value logic for TCADP configuration items - Updated multilingual resource files for new configuration options ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 23:55:06 +08:00 · 2025-11-20 10:08:42 +08:00
parent ecf0322165
commit 420c97199a
18 changed files with 668 additions and 37 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -116,7 +116,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
    else:
        vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
        pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-    
+
    sections, tables = pdf_parser(
        filename if not binary else binary,
        from_page=from_page,
@ -504,7 +504,7 @@ class Markdown(MarkdownParser):

        return images if images else None

-    def __call__(self, filename, binary=None, separate_tables=True,delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
@ -602,7 +602,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)

-        tables=vision_figure_parser_docx_wrapper(sections=sections,tbls=tables,callback=callback,**kwargs)
+        tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")
@ -653,18 +653,47 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

        if name in ["tcadp", "docling", "mineru"]:
            parser_config["chunk_token_num"] = 0
-        
+
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
-        excel_parser = ExcelParser()
-        if parser_config.get("html4excel"):
-            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+
+        # Check if tcadp_parser is selected for spreadsheet files
+        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+        if layout_recognizer == "TCADP Parser":
+            table_result_type = parser_config.get("table_result_type", "1")
+            markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+                return res
+
+            # Determine file type based on extension
+            file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=filename,
+                binary=binary,
+                callback=callback,
+                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+                file_type=file_type
+            )
+            parser_config["chunk_token_num"] = 0
+            res = tokenize_table(tables, doc, is_english)
+            callback(0.8, "Finish parsing.")
        else:
-            sections = [(_, "") for _ in excel_parser(binary) if _]
-        parser_config["chunk_token_num"] = 12800
+            # Default DeepDOC parser
+            excel_parser = ExcelParser()
+            if parser_config.get("html4excel"):
+                sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+            else:
+                sections = [(_, "") for _ in excel_parser(binary) if _]
+            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
@ -676,7 +705,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False,delimiter=parser_config.get("delimiter", "\n!?;。；！？"))
+        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。；！？"))

        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -16,6 +16,7 @@ import io
 import json
 import os
 import random
+import re
 from functools import partial

 import trio
@ -83,6 +84,7 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "spreadsheet": {
+                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
                "output_format": "html",
                "suffix": [
                    "xls",
@ -102,8 +104,10 @@ class ParserParam(ProcessParamBase):
                "output_format": "json",
            },
            "slides": {
+                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
                "suffix": [
                    "pptx",
+                    "ppt"
                ],
                "output_format": "json",
            },
@ -245,7 +249,12 @@ class Parser(ProcessBase):
                bboxes.append(box)
        elif conf.get("parse_method").lower() == "tcadp parser":
            # ADP is a document parsing tool using Tencent Cloud API
-            tcadp_parser = TCADPParser()
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
            sections, _ = tcadp_parser.parse_pdf(
                filepath=name,
                binary=blob,
@ -301,14 +310,86 @@ class Parser(ProcessBase):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
        conf = self._param.setups["spreadsheet"]
        self.set_output("output_format", conf["output_format"])
-        spreadsheet_parser = ExcelParser()
-        if conf.get("output_format") == "html":
-            htmls = spreadsheet_parser.html(blob, 1000000000)
-            self.set_output("html", htmls[0])
-        elif conf.get("output_format") == "json":
-            self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
-        elif conf.get("output_format") == "markdown":
-            self.set_output("markdown", spreadsheet_parser.markdown(blob))
+
+        parse_method = conf.get("parse_method", "deepdoc")
+
+        # Handle TCADP parser
+        if parse_method.lower() == "tcadp parser":
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")
+
+            # Determine file type based on extension
+            if re.search(r"\.xlsx?$", name, re.IGNORECASE):
+                file_type = "XLSX"
+            else:
+                file_type = "CSV"
+
+            self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                file_type=file_type,
+                file_start_page=1,
+                file_end_page=1000
+            )
+
+            # Process TCADP parser output based on configured output_format
+            output_format = conf.get("output_format", "html")
+
+            if output_format == "html":
+                # For HTML output, combine sections and tables into HTML
+                html_content = ""
+                for section, position_tag in sections:
+                    if section:
+                        html_content += section + "\n"
+                for table in tables:
+                    if table:
+                        html_content += table + "\n"
+
+                self.set_output("html", html_content)
+
+            elif output_format == "json":
+                # For JSON output, create a list of text items
+                result = []
+                # Add sections as text
+                for section, position_tag in sections:
+                    if section:
+                        result.append({"text": section})
+                # Add tables as text
+                for table in tables:
+                    if table:
+                        result.append({"text": table})
+
+                self.set_output("json", result)
+
+            elif output_format == "markdown":
+                # For markdown output, combine into markdown
+                md_content = ""
+                for section, position_tag in sections:
+                    if section:
+                        md_content += section + "\n\n"
+                for table in tables:
+                    if table:
+                        md_content += table + "\n\n"
+
+                self.set_output("markdown", md_content)
+        else:
+            # Default DeepDOC parser
+            spreadsheet_parser = ExcelParser()
+            if conf.get("output_format") == "html":
+                htmls = spreadsheet_parser.html(blob, 1000000000)
+                self.set_output("html", htmls[0])
+            elif conf.get("output_format") == "json":
+                self.set_output("json", [{"text": txt} for txt in spreadsheet_parser(blob) if txt])
+            elif conf.get("output_format") == "markdown":
+                self.set_output("markdown", spreadsheet_parser.markdown(blob))

    def _word(self, name, blob):
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
@ -326,22 +407,69 @@ class Parser(ProcessBase):
            self.set_output("markdown", markdown_text)

    def _slides(self, name, blob):
-        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
-
        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")

        conf = self._param.setups["slides"]
        self.set_output("output_format", conf["output_format"])

-        ppt_parser = ppt_parser()
-        txts = ppt_parser(blob, 0, 100000, None)
+        parse_method = conf.get("parse_method", "deepdoc")

-        sections = [{"text": section} for section in txts if section.strip()]
+        # Handle TCADP parser
+        if parse_method.lower() == "tcadp parser":
+            table_result_type = conf.get("table_result_type", "1")
+            markdown_image_response_type = conf.get("markdown_image_response_type", "1")
+            tcadp_parser = TCADPParser(
+                table_result_type=table_result_type,
+                markdown_image_response_type=markdown_image_response_type
+            )
+            if not tcadp_parser.check_installation():
+                raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

-        # json
-        assert conf.get("output_format") == "json", "have to be json for ppt"
-        if conf.get("output_format") == "json":
-            self.set_output("json", sections)
+            # Determine file type based on extension
+            if re.search(r"\.pptx?$", name, re.IGNORECASE):
+                file_type = "PPTX"
+            else:
+                file_type = "PPT"
+
+            self.callback(0.2, f"Using TCADP parser for {file_type} file.")
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                file_type=file_type,
+                file_start_page=1,
+                file_end_page=1000
+            )
+
+            # Process TCADP parser output - PPT only supports json format
+            output_format = conf.get("output_format", "json")
+            if output_format == "json":
+                # For JSON output, create a list of text items
+                result = []
+                # Add sections as text
+                for section, position_tag in sections:
+                    if section:
+                        result.append({"text": section})
+                # Add tables as text
+                for table in tables:
+                    if table:
+                        result.append({"text": table})
+
+                self.set_output("json", result)
+        else:
+            # Default DeepDOC parser (supports .pptx format)
+            from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
+
+            ppt_parser = ppt_parser()
+            txts = ppt_parser(blob, 0, 100000, None)
+
+            sections = [{"text": section} for section in txts if section.strip()]
+
+            # json
+            assert conf.get("output_format") == "json", "have to be json for ppt"
+            if conf.get("output_format") == "json":
+                self.set_output("json", sections)

    def _markdown(self, name, blob):
        from functools import reduce
@ -579,6 +707,7 @@ class Parser(ProcessBase):
            "video": self._video,
            "email": self._email,
        }
+
        try:
            from_upstream = ParserFromUpstream.model_validate(kwargs)
        except Exception as e: