Feat: add TCADP Parser (#10775)

### What problem does this PR solve?

This PR adds a new TCADP (Tencent Cloud Advanced Document Processing) parser to RAGFlow, enabling users to leverage Tencent Cloud's document parsing capabilities for more accurate and structured document processing.

The implementation includes:

- New TCADP Parser: A complete implementation of Tencent Cloud's document parsing API without SDK dependency
- Configuration Support: Added configuration options in service_conf.yaml for Tencent Cloud API credentials
- Frontend Integration: Updated UI components to support the new TCADP parser option
- Error Handling: Comprehensive error handling and retry mechanisms for API calls
- Result Processing: Support for both SSE streaming and JSON response formats from Tencent Cloud API

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-30 23:26:36 +08:00 · 2025-10-27 15:14:58 +08:00
parent 56def59c2b
commit 33a189f620
10 changed files with 579 additions and 9 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.docling_parser import DoclingParser
+from deepdoc.parser.tcadp_parser import TCADPParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


@ -550,7 +551,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            parser_config["chunk_token_num"] = 0
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
-        
+
+
+        elif layout_recognizer == "TCADP Parser":
+            tcadp_parser = TCADPParser()
+            if not tcadp_parser.check_installation():
+                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+                return res
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=filename,
+                binary=binary,
+                callback=callback,
+                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+                file_type="PDF"
+            )
+            parser_config["chunk_token_num"] = 0
+            callback(0.8, "Finish parsing.")
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -31,6 +31,7 @@ from api.utils.base64_image import image2id
 from deepdoc.parser import ExcelParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
+from deepdoc.parser.tcadp_parser import TCADPParser
 from rag.app.naive import Docx
 from rag.flow.base import ProcessBase, ProcessParamBase
 from rag.flow.parser.schema import ParserFromUpstream
@ -74,7 +75,7 @@ class ParserParam(ProcessParamBase):

        self.setups = {
            "pdf": {
-                "parse_method": "deepdoc",  # deepdoc/plain_text/vlm
+                "parse_method": "deepdoc",  # deepdoc/plain_text/tcadp parser/vlm
                "lang": "Chinese",
                "suffix": [
                    "pdf",
@ -157,7 +158,7 @@ class ParserParam(ProcessParamBase):
            pdf_parse_method = pdf_config.get("parse_method", "")
            self.check_empty(pdf_parse_method, "Parse method abnormal.")

-            if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
+            if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
                self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")

            pdf_output_format = pdf_config.get("output_format", "")
@ -240,6 +241,39 @@ class Parser(ProcessBase):
                    "text": t,
                }
                bboxes.append(box)
+        elif conf.get("parse_method").lower() == "tcadp parser":
+            # ADP is a document parsing tool using Tencent Cloud API
+            tcadp_parser = TCADPParser()
+            sections, _ = tcadp_parser.parse_pdf(
+                filepath=name,
+                binary=blob,
+                callback=self.callback,
+                file_type="PDF",
+                file_start_page=1,
+                file_end_page=1000
+            )
+            bboxes = []
+            for section, position_tag in sections:
+                if position_tag:
+                    # Extract position information from TCADP's position tag
+                    # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
+                    import re
+                    match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
+                    if match:
+                        pn, x0, x1, top, bott = match.groups()
+                        bboxes.append({
+                            "page_number": int(pn.split('-')[0]),  # Take the first page number
+                            "x0": float(x0),
+                            "x1": float(x1),
+                            "top": float(top),
+                            "bottom": float(bott),
+                            "text": section
+                        })
+                    else:
+                        # If no position info, add as text without position
+                        bboxes.append({"text": section})
+                else:
+                    bboxes.append({"text": section})
        else:
            vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
            lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)