Feat: add TCADP Parser (#10775)

### What problem does this PR solve?

This PR adds a new TCADP (Tencent Cloud Advanced Document Processing)
parser to RAGFlow, enabling users to leverage Tencent Cloud's document
parsing capabilities for more accurate and structured document
processing. The implementation includes:
New TCADP Parser: A complete implementation of Tencent Cloud's document
parsing API without SDK dependency
Configuration Support: Added configuration options in service_conf.yaml
for Tencent Cloud API credentials
Frontend Integration: Updated UI components to support the new TCADP
parser option
Error Handling: Comprehensive error handling and retry mechanisms for
API calls
Result Processing: Support for both SSE streaming and JSON response
formats from Tencent Cloud API

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
aidan
2025-10-27 15:14:58 +08:00
committed by GitHub
parent 56def59c2b
commit 33a189f620
10 changed files with 579 additions and 9 deletions

View File

@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@ -550,7 +551,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
else:
if layout_recognizer == "Plain Text":
pdf_parser = PlainParser()

View File

@ -31,6 +31,7 @@ from api.utils.base64_image import image2id
from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.schema import ParserFromUpstream
@ -74,7 +75,7 @@ class ParserParam(ProcessParamBase):
self.setups = {
"pdf": {
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
"parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm
"lang": "Chinese",
"suffix": [
"pdf",
@ -157,7 +158,7 @@ class ParserParam(ProcessParamBase):
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
@ -240,6 +241,39 @@ class Parser(ProcessBase):
"text": t,
}
bboxes.append(box)
elif conf.get("parse_method").lower() == "tcadp parser":
# ADP is a document parsing tool using Tencent Cloud API
tcadp_parser = TCADPParser()
sections, _ = tcadp_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
file_type="PDF",
file_start_page=1,
file_end_page=1000
)
bboxes = []
for section, position_tag in sections:
if position_tag:
# Extract position information from TCADP's position tag
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
import re
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if match:
pn, x0, x1, top, bott = match.groups()
bboxes.append({
"page_number": int(pn.split('-')[0]), # Take the first page number
"x0": float(x0),
"x1": float(x1),
"top": float(top),
"bottom": float(bott),
"text": section
})
else:
# If no position info, add as text without position
bboxes.append({"text": section})
else:
bboxes.append({"text": section})
else:
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)