mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: add TCADP Parser (#10775)
### What problem does this PR solve? This PR adds a new TCADP (Tencent Cloud Advanced Document Processing) parser to RAGFlow, enabling users to leverage Tencent Cloud's document parsing capabilities for more accurate and structured document processing. The implementation includes: New TCADP Parser: A complete implementation of Tencent Cloud's document parsing API without SDK dependency Configuration Support: Added configuration options in service_conf.yaml for Tencent Cloud API credentials Frontend Integration: Updated UI components to support the new TCADP parser option Error Handling: Comprehensive error handling and retry mechanisms for API calls Result Processing: Support for both SSE streaming and JSON response formats from Tencent Cloud API ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
@ -31,6 +31,7 @@ from api.utils.base64_image import image2id
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.app.naive import Docx
|
||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||
from rag.flow.parser.schema import ParserFromUpstream
|
||||
@ -74,7 +75,7 @@ class ParserParam(ProcessParamBase):
|
||||
|
||||
self.setups = {
|
||||
"pdf": {
|
||||
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
||||
"parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm
|
||||
"lang": "Chinese",
|
||||
"suffix": [
|
||||
"pdf",
|
||||
@ -157,7 +158,7 @@ class ParserParam(ProcessParamBase):
|
||||
pdf_parse_method = pdf_config.get("parse_method", "")
|
||||
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
||||
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
|
||||
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
||||
|
||||
pdf_output_format = pdf_config.get("output_format", "")
|
||||
@ -240,6 +241,39 @@ class Parser(ProcessBase):
|
||||
"text": t,
|
||||
}
|
||||
bboxes.append(box)
|
||||
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||
# ADP is a document parsing tool using Tencent Cloud API
|
||||
tcadp_parser = TCADPParser()
|
||||
sections, _ = tcadp_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
file_type="PDF",
|
||||
file_start_page=1,
|
||||
file_end_page=1000
|
||||
)
|
||||
bboxes = []
|
||||
for section, position_tag in sections:
|
||||
if position_tag:
|
||||
# Extract position information from TCADP's position tag
|
||||
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
|
||||
import re
|
||||
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
|
||||
if match:
|
||||
pn, x0, x1, top, bott = match.groups()
|
||||
bboxes.append({
|
||||
"page_number": int(pn.split('-')[0]), # Take the first page number
|
||||
"x0": float(x0),
|
||||
"x1": float(x1),
|
||||
"top": float(top),
|
||||
"bottom": float(bott),
|
||||
"text": section
|
||||
})
|
||||
else:
|
||||
# If no position info, add as text without position
|
||||
bboxes.append({"text": section})
|
||||
else:
|
||||
bboxes.append({"text": section})
|
||||
else:
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
|
||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||
|
||||
Reference in New Issue
Block a user