Feat: add TCADP Parser (#10775)

### What problem does this PR solve?

This PR adds a new TCADP (Tencent Cloud Advanced Document Processing) parser to RAGFlow, enabling users to leverage Tencent Cloud's document parsing capabilities for more accurate and structured document processing.

The implementation includes:

- New TCADP Parser: A complete implementation of Tencent Cloud's document parsing API without an SDK dependency
- Configuration Support: Added configuration options in service_conf.yaml for Tencent Cloud API credentials
- Frontend Integration: Updated UI components to support the new TCADP parser option
- Error Handling: Comprehensive error handling and retry mechanisms for API calls
- Result Processing: Support for both SSE streaming and JSON response formats from the Tencent Cloud API

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-31 23:55:06 +08:00 · 2025-10-27 15:14:58 +08:00
parent 56def59c2b
commit 33a189f620
10 changed files with 579 additions and 9 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.mineru_parser import MinerUParser
 from deepdoc.parser.docling_parser import DoclingParser
+from deepdoc.parser.tcadp_parser import TCADPParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


@@ -550,7 +551,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            parser_config["chunk_token_num"] = 0
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
-        
+
+
+        elif layout_recognizer == "TCADP Parser":
+            tcadp_parser = TCADPParser()
+            if not tcadp_parser.check_installation():
+                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
+                return res
+
+            sections, tables = tcadp_parser.parse_pdf(
+                filepath=filename,
+                binary=binary,
+                callback=callback,
+                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
+                file_type="PDF"
+            )
+            parser_config["chunk_token_num"] = 0
+            callback(0.8, "Finish parsing.")
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()