mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add TCADP Parser (#10775)
### What problem does this PR solve?

This PR adds a new TCADP (Tencent Cloud Advanced Document Processing) parser to RAGFlow, enabling users to leverage Tencent Cloud's document parsing capabilities for more accurate and structured document processing. The implementation includes:

- New TCADP Parser: a client for Tencent Cloud's document parsing API, built on the official Tencent Cloud SDK (`tencentcloud-sdk-python`).
- Configuration Support: configuration options in service_conf.yaml for the Tencent Cloud API credentials.
- Frontend Integration: updated UI components to support the new TCADP parser option.
- Error Handling: error handling and a retry mechanism for API calls.
- Result Processing: support for both SSE streaming and JSON response formats from the Tencent Cloud API.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
@ -133,3 +133,9 @@ user_default_llm:
|
|||||||
# - "RAGFlow" # display name
|
# - "RAGFlow" # display name
|
||||||
# - "" # sender email address
|
# - "" # sender email address
|
||||||
# mail_frontend_url: "https://your-frontend.example.com"
|
# mail_frontend_url: "https://your-frontend.example.com"
|
||||||
|
# tcadp_config:
|
||||||
|
# secret_id: 'tencent_secret_id'
|
||||||
|
# secret_key: 'tencent_secret_key'
|
||||||
|
# region: 'tencent_region'
|
||||||
|
# table_result_type: '1'
|
||||||
|
# markdown_image_response_type: '1'
|
||||||
|
|||||||
504
deepdoc/parser/tcadp_parser.py
Normal file
504
deepdoc/parser/tcadp_parser.py
Normal file
@ -0,0 +1,504 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
import types
|
||||||
|
import zipfile
|
||||||
|
from datetime import datetime
|
||||||
|
from io import BytesIO
|
||||||
|
from os import PathLike
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from tencentcloud.common import credential
|
||||||
|
from tencentcloud.common.profile.client_profile import ClientProfile
|
||||||
|
from tencentcloud.common.profile.http_profile import HttpProfile
|
||||||
|
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
|
||||||
|
from tencentcloud.lkeap.v20240522 import lkeap_client, models
|
||||||
|
|
||||||
|
from api.utils.configs import get_base_config
|
||||||
|
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||||
|
|
||||||
|
|
||||||
|
class TencentCloudAPIClient:
    """Tencent Cloud API client using the official SDK.

    Wraps an ``lkeap_client.LkeapClient`` and exposes two operations: calling
    the ``ReconstructDocumentSSE`` document parsing API and downloading the
    result archive the API points to.
    """

    def __init__(self, secret_id, secret_key, region):
        """Build the SDK credential, HTTP/client profiles and the LKEAP client.

        Args:
            secret_id: Tencent Cloud API secret id.
            secret_key: Tencent Cloud API secret key.
            region: Region string passed to the SDK client.
        """
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.region = region

        # Create credentials
        self.cred = credential.Credential(secret_id, secret_key)

        # HTTP profile: pins the service endpoint used for every request
        self.httpProfile = HttpProfile()
        self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"

        # Client profile carrying the HTTP profile above
        self.clientProfile = ClientProfile()
        self.clientProfile.httpProfile = self.httpProfile

        # Client object for the LKEAP product
        self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)

    def reconstruct_document_sse(self, file_type, file_url=None, file_base64=None, file_start_page=1, file_end_page=1000, config=None):
        """Call the document parsing API using the official SDK.

        Args:
            file_type: Value sent as ``FileType`` (e.g. ``"PDF"``).
            file_url: Optional URL of the document; takes precedence over
                ``file_base64`` when both are given.
            file_base64: Optional Base64-encoded document content.
            file_start_page: Value sent as ``FileStartPageNumber``.
            file_end_page: Value sent as ``FileEndPageNumber``.
            config: Optional dict sent as ``Config``.

        Returns:
            The decoded result dict on success. For a streaming response this
            is the event whose ``Progress`` is 100, or ``{}`` if the stream
            ended without one. ``None`` is returned on any error (missing
            input, SDK error, undecodable response).
        """
        # Validate the input before touching the SDK so a caller mistake does
        # not cost a request object or a network round trip.
        if not file_url and not file_base64:
            logging.error("[TCADP] Must provide either FileUrl or FileBase64 parameter")
            return None

        try:
            # Each API has its own request object
            req = models.ReconstructDocumentSSERequest()

            params = {
                "FileType": file_type,
                "FileStartPageNumber": file_start_page,
                "FileEndPageNumber": file_end_page,
            }

            # According to Tencent Cloud API documentation, either FileUrl or
            # FileBase64 must be provided; if both are provided only FileUrl is used.
            if file_url:
                params["FileUrl"] = file_url
                logging.info(f"[TCADP] Using file URL: {file_url}")
            else:
                params["FileBase64"] = file_base64
                logging.info(f"[TCADP] Using Base64 data, length: {len(file_base64)} characters")

            if config:
                params["Config"] = config

            req.from_json_string(json.dumps(params))
            resp = self.client.ReconstructDocumentSSE(req)

            if isinstance(resp, types.GeneratorType):  # Streaming response
                logging.info("[TCADP] Detected streaming response")
                return self._collect_stream_result(resp)

            logging.info("[TCADP] Detected non-streaming response")
            return self._parse_plain_response(resp)

        except TencentCloudSDKException as err:
            logging.error(f"[TCADP] Tencent Cloud SDK error: {err}")
            return None
        except Exception as e:
            # logging.exception records the stack trace together with the message
            logging.exception(f"[TCADP] Unknown error: {e}")
            return None

    @staticmethod
    def _collect_stream_result(events):
        """Consume SSE events and return the final payload, or ``{}`` if none arrives."""
        for event in events:
            logging.info(f"[TCADP] Received event: {event}")
            raw = event.get("data")
            if not raw:
                logging.info(f"[TCADP] Event without data: {event}")
                continue

            try:
                data_dict = json.loads(raw)
            except json.JSONDecodeError as e:
                logging.error(f"[TCADP] Failed to parse JSON data: {e}")
                logging.error(f"[TCADP] Raw data: {raw}")
                continue

            if not isinstance(data_dict, dict):
                # A JSON scalar/list carries no progress info; skip it instead
                # of aborting the whole stream on an AttributeError.
                logging.warning(f"[TCADP] Ignoring non-object event payload: {data_dict!r}")
                continue

            logging.info(f"[TCADP] Parsed data: {data_dict}")

            # Compare as text so both "100" and a numeric 100 mark completion.
            if str(data_dict.get("Progress")) != "100":
                logging.info(f"[TCADP] Progress: {data_dict.get('Progress', '0')}%")
                continue

            logging.info("[TCADP] Document parsing completed!")
            logging.info(f"[TCADP] Task ID: {data_dict.get('TaskId')}")
            logging.info(f"[TCADP] Success pages: {data_dict.get('SuccessPageNum')}")
            logging.info(f"[TCADP] Failed pages: {data_dict.get('FailPageNum')}")

            failed_pages = data_dict.get("FailedPages") or []
            if failed_pages:
                logging.warning("[TCADP] Failed parsing pages:")
                for page in failed_pages:
                    logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")

            download_url = data_dict.get("DocumentRecognizeResultUrl")
            if download_url:
                logging.info(f"[TCADP] Got download link: {download_url}")
            else:
                logging.warning("[TCADP] No download link obtained")

            return data_dict  # Final result found; stop reading the stream

        logging.warning("[TCADP] Stream ended without a final (Progress 100) result")
        return {}

    @staticmethod
    def _parse_plain_response(resp):
        """Decode a non-streaming response's ``data`` attribute; ``None`` on failure."""
        raw = getattr(resp, "data", None)
        if not raw:
            logging.error("[TCADP] No data in response")
            return None

        try:
            parser_result = json.loads(raw)
        except json.JSONDecodeError as e:
            logging.error(f"[TCADP] JSON parsing failed: {e}")
            return None

        logging.info(f"[TCADP] JSON parsing successful: {parser_result}")
        return parser_result

    def download_result_file(self, download_url, output_dir, timeout=60):
        """Download the parsing result archive into ``output_dir``.

        Args:
            download_url: URL of the result archive; falsy means nothing to do.
            output_dir: Directory to store the archive in (created if missing).
            timeout: Timeout in seconds passed to ``requests.get`` so an
                unresponsive server cannot block the caller indefinitely.

        Returns:
            Path of the saved ``.zip`` file, or ``None`` when there is no URL
            or the HTTP request fails.
        """
        if not download_url:
            logging.warning("[TCADP] No downloadable result file")
            return None

        try:
            response = requests.get(download_url, timeout=timeout)
            response.raise_for_status()

            # Ensure output directory exists
            os.makedirs(output_dir, exist_ok=True)

            # The timestamp keeps the name readable; the random suffix added by
            # tempfile keeps downloads that land in the same second (and the
            # same directory) from overwriting each other.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            with tempfile.NamedTemporaryFile(
                mode="wb",
                dir=output_dir,
                prefix=f"tcadp_result_{timestamp}_",
                suffix=".zip",
                delete=False,
            ) as f:
                f.write(response.content)
                file_path = f.name

            logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
            return file_path

        except requests.exceptions.RequestException as e:
            logging.error(f"[TCADP] Failed to download file: {e}")
            return None
|
||||||
|
|
||||||
|
|
||||||
|
class TCADPParser(RAGFlowPdfParser):
    """PDF parser that delegates document parsing to the Tencent Cloud API.

    Credentials and options come from the constructor arguments and the
    ``tcadp_config`` section of the RAGFlow configuration. ``parse_pdf`` sends
    the document as Base64, downloads the result ZIP and converts its JSON /
    Markdown members into ``(text, position_tag)`` sections and HTML tables.
    """

    # Region used when neither the caller nor the configuration supplies one.
    DEFAULT_REGION = "ap-guangzhou"

    # Placeholder position tag attached to every section (simplified version;
    # it is a constant, not coordinates taken from the parsing result).
    _PLACEHOLDER_POSITION_TAG = "@@1\t0.0\t1000.0\t0.0\t100.0##"

    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
        """Resolve credentials and options from the arguments and ``tcadp_config``.

        Args:
            secret_id: Tencent Cloud secret id; falls back to the configuration.
            secret_key: Tencent Cloud secret key; falls back to the configuration.
            region: Tencent Cloud region. When left at the default, a region
                present in the configuration is used instead.

        Raises:
            ValueError: If no secret id / secret key could be resolved.
        """
        super().__init__()

        # First initialize logger
        self.logger = logging.getLogger(self.__class__.__name__)

        # Seed every attribute from the arguments/defaults so they exist even
        # when the configuration section is missing; otherwise the credential
        # check below would fail with AttributeError instead of ValueError.
        self.secret_id = secret_id
        self.secret_key = secret_key
        self.region = region or self.DEFAULT_REGION
        self.table_result_type = "1"
        self.markdown_image_response_type = "1"

        # Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
        try:
            tcadp_config = get_base_config("tcadp_config", {})
        except ImportError:
            self.logger.info("[TCADP] Configuration module import failed")
            tcadp_config = {}

        if isinstance(tcadp_config, dict) and tcadp_config:
            self.secret_id = secret_id or tcadp_config.get("secret_id")
            self.secret_key = secret_key or tcadp_config.get("secret_key")
            # The region argument always has a truthy default, so "argument or
            # config" would never read the configured region. Only an
            # explicitly different region overrides the configuration.
            if not region or region == self.DEFAULT_REGION:
                self.region = tcadp_config.get("region") or self.DEFAULT_REGION
            self.table_result_type = tcadp_config.get("table_result_type", "1")
            self.markdown_image_response_type = tcadp_config.get("markdown_image_response_type", "1")
            self.logger.info("[TCADP] Configuration read from service_conf.yaml")
        else:
            self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")

        if not self.secret_id or not self.secret_key:
            raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")

    def check_installation(self) -> bool:
        """Check if Tencent Cloud API configuration is correct.

        Returns:
            ``True`` when credentials are present and a client object can be
            constructed from them, ``False`` otherwise.
        """
        try:
            # Check necessary configuration parameters
            if not self.secret_id or not self.secret_key:
                self.logger.error("[TCADP] Tencent Cloud API configuration incomplete")
                return False

            # Try to create client to verify configuration
            TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)
            self.logger.info("[TCADP] Tencent Cloud API configuration check passed")
            return True
        except Exception as e:
            self.logger.error(f"[TCADP] Tencent Cloud API configuration check failed: {e}")
            return False

    def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
        """Convert file to Base64 format.

        Args:
            file_path: Path read only when ``binary`` is empty.
            binary: Document content as ``bytes`` (a ``BytesIO`` is also accepted).

        Returns:
            The Base64 text of the document.
        """
        if isinstance(binary, BytesIO):
            binary = binary.getvalue()

        if binary:
            # If binary data is directly available, convert directly
            return base64.b64encode(binary).decode("utf-8")

        # Read from file path and convert
        with open(file_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")

    def _extract_content_from_zip(self, zip_path: str) -> list[dict[str, Any]]:
        """Extract parsing results from downloaded ZIP file.

        JSON members are loaded first (lists are flattened into the result),
        then each Markdown member is appended as a ``{"type": "text", ...}``
        entry. A member that cannot be read is logged and skipped so the
        remaining members are still returned.
        """
        results = []
        # Failures expected for a single damaged member.
        member_errors = (ValueError, OSError, zipfile.BadZipFile, RuntimeError, NotImplementedError)

        try:
            with zipfile.ZipFile(zip_path, "r") as zip_file:
                names = zip_file.namelist()

                # Find JSON result files
                for json_file in (n for n in names if n.endswith(".json")):
                    try:
                        with zip_file.open(json_file) as f:
                            data = json.load(f)
                    except member_errors as e:
                        self.logger.error(f"[TCADP] Failed to read {json_file} from ZIP file: {e}")
                        continue
                    if isinstance(data, list):
                        results.extend(data)
                    else:
                        results.append(data)

                # Find Markdown files
                for md_file in (n for n in names if n.endswith(".md")):
                    try:
                        with zip_file.open(md_file) as f:
                            content = f.read().decode("utf-8")
                    except member_errors as e:
                        self.logger.error(f"[TCADP] Failed to read {md_file} from ZIP file: {e}")
                        continue
                    results.append({"type": "text", "content": content, "file": md_file})

        except Exception as e:
            # Best effort: an unreadable archive yields whatever was collected so far.
            self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")

        return results

    def _parse_content_to_sections(self, content_data: list[dict[str, Any]]) -> list[tuple[str, str]]:
        """Convert parsing results to sections format.

        Each item is read through its ``type``, ``content``, ``table_data``
        and ``caption`` keys. Returns ``(text, position_tag)`` tuples; the tag
        is a fixed placeholder.
        """
        sections = []

        for item in content_data:
            if not isinstance(item, dict):
                # JSON members may hold arbitrary values; only objects can be interpreted.
                self.logger.warning(f"[TCADP] Skipping non-object content item: {item!r}")
                continue

            content_type = item.get("type", "text")
            content = item.get("content", "")

            # Tables and images build their text from table_data / caption, so
            # they must not be dropped just because "content" is empty.
            if content_type == "table":
                table_data = item.get("table_data", {})
                if isinstance(table_data, dict):
                    rows = table_data.get("rows", [])
                    # str() keeps numeric / null cells from breaking the join
                    section_text = "\n".join(" | ".join(str(cell) for cell in row) for row in rows)
                else:
                    section_text = str(table_data) if table_data else ""
            elif content_type == "image":
                caption = item.get("caption", "")
                section_text = f"[Image] {caption}" if caption else "[Image]"
            elif not content:
                continue
            elif content_type == "equation":
                section_text = f"$${content}$$"
            else:
                # "text", "paragraph" and any unknown type use the content as-is
                section_text = content if isinstance(content, str) else str(content)

            if section_text.strip():
                sections.append((section_text, self._PLACEHOLDER_POSITION_TAG))

        return sections

    def _parse_content_to_tables(self, content_data: list[dict[str, Any]]) -> list:
        """Convert parsing results to tables format.

        Returns one HTML ``<table>`` string per ``type == "table"`` item whose
        ``table_data["rows"]`` is non-empty; the first row is emitted as
        header cells.
        """
        tables = []

        for item in content_data:
            if not isinstance(item, dict) or item.get("type") != "table":
                continue
            table_data = item.get("table_data", {})
            if not isinstance(table_data, dict):
                continue
            rows = table_data.get("rows", [])
            if not rows:
                continue

            # Collect fragments and join once instead of repeated string +=
            parts = ["<table>\n"]
            for i, row in enumerate(rows):
                tag = "th" if i == 0 else "td"
                parts.append(" <tr>\n")
                parts.extend(f" <{tag}>{cell}</{tag}>\n" for cell in row)
                parts.append(" </tr>\n")
            parts.append("</table>")
            tables.append("".join(parts))

        return tables

    def _call_api_with_retries(self, client, file_type, file_base64, file_start_page, file_end_page, attempts, report):
        """Call the parsing API up to ``attempts`` times with exponential backoff.

        Returns the first truthy result, or ``None`` when every attempt
        failed. An exception raised by the last attempt is propagated.
        """
        config = {
            "TableResultType": self.table_result_type,
            "MarkdownImageResponseType": self.markdown_image_response_type,
        }

        for attempt in range(attempts):
            try:
                if attempt > 0:
                    self.logger.info(f"[TCADP] Retry attempt {attempt + 1}")
                    # Keep the reported progress below the 0.6 download step
                    report(min(0.3 + attempt * 0.1, 0.55), f"[TCADP] Retry attempt {attempt + 1}")
                    time.sleep(2 ** attempt)  # Exponential backoff

                result = client.reconstruct_document_sse(
                    file_type=file_type,
                    file_base64=file_base64,
                    file_start_page=file_start_page,
                    file_end_page=file_end_page,
                    config=config,
                )

                if result:
                    self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
                    return result
                self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")

            except Exception as e:
                self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
                if attempt == attempts - 1:
                    raise

        return None

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes,
        callback: Optional[Callable] = None,
        *,
        output_dir: Optional[str] = None,
        file_type: str = "PDF",
        file_start_page: Optional[int] = 1,
        file_end_page: Optional[int] = 1000,
        delete_output: Optional[bool] = True,
        max_retries: Optional[int] = 1,
    ) -> tuple:
        """Parse PDF document.

        Args:
            filepath: Path of the document; read only when ``binary`` is empty.
            binary: Document content as ``bytes`` or ``BytesIO``.
            callback: Optional ``callback(progress, message)``; ``-1`` progress
                signals a failure.
            output_dir: Directory for the downloaded result archive. When empty
                a temporary directory is created.
            file_type: File type sent to the API.
            file_start_page: First page to parse (``None`` means 1).
            file_end_page: Last page to parse (``None`` means 1000).
            delete_output: Remove the temporary directory created by this call
                afterwards. A caller-supplied ``output_dir`` is never removed.
            max_retries: Number of API attempts; values below 1 are treated as 1.

        Returns:
            ``(sections, tables)`` as produced by ``_parse_content_to_sections``
            and ``_parse_content_to_tables``.

        Raises:
            FileNotFoundError: No binary was given and ``filepath`` does not exist.
            RuntimeError: The API returned no result or download link, or the
                result archive could not be downloaded.
        """

        def report(progress, message):
            if callback:
                callback(progress, message)

        out_dir = None
        created_tmp_dir = False

        try:
            # Handle input file. The signature allows BytesIO, which base64
            # cannot encode directly, so normalise it to bytes first.
            if isinstance(binary, BytesIO):
                binary = binary.getvalue()

            file_path = str(filepath)
            if binary:
                # The content is encoded straight from memory; no temporary
                # copy on disk is needed.
                self.logger.info(f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
                report(0.1, f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
            elif not os.path.exists(file_path):
                report(-1, f"[TCADP] PDF file does not exist: {file_path}")
                raise FileNotFoundError(f"[TCADP] PDF file does not exist: {file_path}")

            # Convert file to Base64 format
            report(0.2, "[TCADP] Converting file to Base64 format")
            file_base64 = self._file_to_base64(file_path, binary)
            report(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")

            # Create Tencent Cloud API client
            client = TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)

            # Call document parsing API (with retry mechanism)
            report(0.3, "[TCADP] Starting to call Tencent Cloud document parsing API")
            attempts = max(1, int(max_retries or 1))
            result = self._call_api_with_retries(
                client,
                file_type,
                file_base64,
                file_start_page or 1,
                file_end_page or 1000,
                attempts,
                report,
            )

            if not result:
                error_msg = f"[TCADP] Document parsing failed, retried {attempts} times"
                self.logger.error(error_msg)
                report(-1, error_msg)
                raise RuntimeError(error_msg)

            # Get download link
            download_url = result.get("DocumentRecognizeResultUrl")
            if not download_url:
                report(-1, "[TCADP] No parsing result download link obtained")
                raise RuntimeError("[TCADP] No parsing result download link obtained")

            report(0.6, f"[TCADP] Parsing result download link: {download_url}")

            # Set output directory
            if output_dir:
                out_dir = Path(output_dir)
                out_dir.mkdir(parents=True, exist_ok=True)
            else:
                out_dir = Path(tempfile.mkdtemp(prefix="adp_pdf_"))
                created_tmp_dir = True

            # Download result file
            zip_path = client.download_result_file(download_url, str(out_dir))
            if not zip_path:
                report(-1, "[TCADP] Failed to download parsing result")
                raise RuntimeError("[TCADP] Failed to download parsing result")

            # Shorten file path display, only show filename
            report(0.8, f"[TCADP] Parsing result downloaded: {os.path.basename(zip_path)}")

            # Extract ZIP file content
            content_data = self._extract_content_from_zip(zip_path)
            self.logger.info(f"[TCADP] Extracted {len(content_data)} content blocks")
            report(0.9, f"[TCADP] Extracted {len(content_data)} content blocks")

            # Convert to sections and tables format
            sections = self._parse_content_to_sections(content_data)
            tables = self._parse_content_to_tables(content_data)

            self.logger.info(f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")
            report(1.0, f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")

            return sections, tables

        finally:
            # Clean up only the temporary directory this call created
            if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
                try:
                    shutil.rmtree(out_dir)
                except OSError as e:
                    # Cleanup is best effort; report it instead of hiding it
                    self.logger.warning(f"[TCADP] Failed to remove temporary directory {out_dir}: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: build the ADP parser and report whether its
    # configuration check passes.
    adp_parser = TCADPParser()
    print("ADP available:", adp_parser.check_installation())

    # Set this to a local PDF path to exercise a full parse; the empty
    # default skips the parsing step.
    filepath = ""
    if filepath and os.path.exists(filepath):
        with open(filepath, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
            sections, tables = adp_parser.parse_pdf(filepath=filepath, binary=pdf_bytes)
            print(f"Parsing result: {len(sections)} sections, {len(tables)} tables")
            # Preview at most the first three sections, 100 characters each
            for number, (text, _tag) in enumerate(sections[:3], start=1):
                print(f"Section {number}: {text[:100]}...")
|
||||||
@ -138,3 +138,9 @@ user_default_llm:
|
|||||||
# - "RAGFlow" # display name
|
# - "RAGFlow" # display name
|
||||||
# - "" # sender email address
|
# - "" # sender email address
|
||||||
# mail_frontend_url: "https://your-frontend.example.com"
|
# mail_frontend_url: "https://your-frontend.example.com"
|
||||||
|
# tcadp_config:
|
||||||
|
# secret_id: '${TENCENT_SECRET_ID}'
|
||||||
|
# secret_key: '${TENCENT_SECRET_KEY}'
|
||||||
|
# region: '${TENCENT_REGION}'
|
||||||
|
# table_result_type: '1'
|
||||||
|
# markdown_image_response_type: '1'
|
||||||
|
|||||||
@ -98,7 +98,7 @@ dependencies = [
|
|||||||
"strenum==0.4.15",
|
"strenum==0.4.15",
|
||||||
"tabulate==0.9.0",
|
"tabulate==0.9.0",
|
||||||
"tavily-python==0.5.1",
|
"tavily-python==0.5.1",
|
||||||
"tencentcloud-sdk-python==3.0.1215",
|
"tencentcloud-sdk-python==3.0.1478",
|
||||||
"tika==2.6.0",
|
"tika==2.6.0",
|
||||||
"tiktoken==0.7.0",
|
"tiktoken==0.7.0",
|
||||||
"umap_learn==0.5.6",
|
"umap_learn==0.5.6",
|
||||||
|
|||||||
@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser
|
|||||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||||
from deepdoc.parser.mineru_parser import MinerUParser
|
from deepdoc.parser.mineru_parser import MinerUParser
|
||||||
from deepdoc.parser.docling_parser import DoclingParser
|
from deepdoc.parser.docling_parser import DoclingParser
|
||||||
|
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||||
|
|
||||||
|
|
||||||
@ -550,7 +551,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
parser_config["chunk_token_num"] = 0
|
parser_config["chunk_token_num"] = 0
|
||||||
res = tokenize_table(tables, doc, is_english)
|
res = tokenize_table(tables, doc, is_english)
|
||||||
callback(0.8, "Finish parsing.")
|
callback(0.8, "Finish parsing.")
|
||||||
|
|
||||||
|
|
||||||
|
elif layout_recognizer == "TCADP Parser":
|
||||||
|
tcadp_parser = TCADPParser()
|
||||||
|
if not tcadp_parser.check_installation():
|
||||||
|
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||||
|
return res
|
||||||
|
|
||||||
|
sections, tables = tcadp_parser.parse_pdf(
|
||||||
|
filepath=filename,
|
||||||
|
binary=binary,
|
||||||
|
callback=callback,
|
||||||
|
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||||
|
file_type="PDF"
|
||||||
|
)
|
||||||
|
parser_config["chunk_token_num"] = 0
|
||||||
|
callback(0.8, "Finish parsing.")
|
||||||
else:
|
else:
|
||||||
if layout_recognizer == "Plain Text":
|
if layout_recognizer == "Plain Text":
|
||||||
pdf_parser = PlainParser()
|
pdf_parser = PlainParser()
|
||||||
|
|||||||
@ -31,6 +31,7 @@ from api.utils.base64_image import image2id
|
|||||||
from deepdoc.parser import ExcelParser
|
from deepdoc.parser import ExcelParser
|
||||||
from deepdoc.parser.mineru_parser import MinerUParser
|
from deepdoc.parser.mineru_parser import MinerUParser
|
||||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||||
|
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||||
from rag.app.naive import Docx
|
from rag.app.naive import Docx
|
||||||
from rag.flow.base import ProcessBase, ProcessParamBase
|
from rag.flow.base import ProcessBase, ProcessParamBase
|
||||||
from rag.flow.parser.schema import ParserFromUpstream
|
from rag.flow.parser.schema import ParserFromUpstream
|
||||||
@ -74,7 +75,7 @@ class ParserParam(ProcessParamBase):
|
|||||||
|
|
||||||
self.setups = {
|
self.setups = {
|
||||||
"pdf": {
|
"pdf": {
|
||||||
"parse_method": "deepdoc", # deepdoc/plain_text/vlm
|
"parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm
|
||||||
"lang": "Chinese",
|
"lang": "Chinese",
|
||||||
"suffix": [
|
"suffix": [
|
||||||
"pdf",
|
"pdf",
|
||||||
@ -157,7 +158,7 @@ class ParserParam(ProcessParamBase):
|
|||||||
pdf_parse_method = pdf_config.get("parse_method", "")
|
pdf_parse_method = pdf_config.get("parse_method", "")
|
||||||
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
||||||
|
|
||||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]:
|
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
|
||||||
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
||||||
|
|
||||||
pdf_output_format = pdf_config.get("output_format", "")
|
pdf_output_format = pdf_config.get("output_format", "")
|
||||||
@ -240,6 +241,39 @@ class Parser(ProcessBase):
|
|||||||
"text": t,
|
"text": t,
|
||||||
}
|
}
|
||||||
bboxes.append(box)
|
bboxes.append(box)
|
||||||
|
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||||
|
# ADP is a document parsing tool using Tencent Cloud API
|
||||||
|
tcadp_parser = TCADPParser()
|
||||||
|
sections, _ = tcadp_parser.parse_pdf(
|
||||||
|
filepath=name,
|
||||||
|
binary=blob,
|
||||||
|
callback=self.callback,
|
||||||
|
file_type="PDF",
|
||||||
|
file_start_page=1,
|
||||||
|
file_end_page=1000
|
||||||
|
)
|
||||||
|
bboxes = []
|
||||||
|
for section, position_tag in sections:
|
||||||
|
if position_tag:
|
||||||
|
# Extract position information from TCADP's position tag
|
||||||
|
# Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}##
|
||||||
|
import re
|
||||||
|
match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
|
||||||
|
if match:
|
||||||
|
pn, x0, x1, top, bott = match.groups()
|
||||||
|
bboxes.append({
|
||||||
|
"page_number": int(pn.split('-')[0]), # Take the first page number
|
||||||
|
"x0": float(x0),
|
||||||
|
"x1": float(x1),
|
||||||
|
"top": float(top),
|
||||||
|
"bottom": float(bott),
|
||||||
|
"text": section
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
# If no position info, add as text without position
|
||||||
|
bboxes.append({"text": section})
|
||||||
|
else:
|
||||||
|
bboxes.append({"text": section})
|
||||||
else:
|
else:
|
||||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
|
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
|
||||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||||
|
|||||||
8
uv.lock
generated
8
uv.lock
generated
@ -5450,7 +5450,7 @@ requires-dist = [
|
|||||||
{ name = "strenum", specifier = "==0.4.15" },
|
{ name = "strenum", specifier = "==0.4.15" },
|
||||||
{ name = "tabulate", specifier = "==0.9.0" },
|
{ name = "tabulate", specifier = "==0.9.0" },
|
||||||
{ name = "tavily-python", specifier = "==0.5.1" },
|
{ name = "tavily-python", specifier = "==0.5.1" },
|
||||||
{ name = "tencentcloud-sdk-python", specifier = "==3.0.1215" },
|
{ name = "tencentcloud-sdk-python", specifier = "==3.0.1478" },
|
||||||
{ name = "tika", specifier = "==2.6.0" },
|
{ name = "tika", specifier = "==2.6.0" },
|
||||||
{ name = "tiktoken", specifier = "==0.7.0" },
|
{ name = "tiktoken", specifier = "==0.7.0" },
|
||||||
{ name = "trio", specifier = ">=0.29.0" },
|
{ name = "trio", specifier = ">=0.29.0" },
|
||||||
@ -6508,14 +6508,14 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tencentcloud-sdk-python"
|
name = "tencentcloud-sdk-python"
|
||||||
version = "3.0.1215"
|
version = "3.0.1478"
|
||||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "requests" },
|
{ name = "requests" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/4c/7a320c65d605e817bedd1205c77a612be7d4dde621182cc7c00e334207ce/tencentcloud-sdk-python-3.0.1215.tar.gz", hash = "sha256:24441e69d418301d50be0279cb148a747fc272b836e41d18e213750093f490c6", size = 9566281, upload-time = "2024-08-19T20:24:26.541Z" }
|
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/47/05163b257f6c0e60aed4272d48bdb816567ab3c805d3e8770430f0cc1be2/tencentcloud-sdk-python-3.0.1478.tar.gz", hash = "sha256:89996462d53a672946aa32d01673a4818ebcd8bc72b024f35ebe96cebe2df179", size = 12297889, upload_time = "2025-10-20T20:54:40.603Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/55/08/98090d1a139e8995053ed22e099b43aa4dea8cffe056f8f0bc5178aeecbd/tencentcloud_sdk_python-3.0.1215-py2.py3-none-any.whl", hash = "sha256:899ced749baf74846f1eabf452f74aa0e48d1965f0ca7828a8b73b446f76f5f2", size = 10265517, upload-time = "2024-08-19T20:24:19.52Z" },
|
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/db/daa85799b9af2aa50539b27eeb0d6a2a0ac35465f62683107847830dbe4d/tencentcloud_sdk_python-3.0.1478-py2.py3-none-any.whl", hash = "sha256:10ddee1c1348f49e2b54af606f978d4cb17fca656639e8d99b6527e6e4793833", size = 12984723, upload_time = "2025-10-20T20:54:27.767Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|||||||
@ -19,6 +19,7 @@ export const enum ParseDocumentType {
|
|||||||
PlainText = 'Plain Text',
|
PlainText = 'Plain Text',
|
||||||
MinerU = 'MinerU',
|
MinerU = 'MinerU',
|
||||||
Docling = 'Docling',
|
Docling = 'Docling',
|
||||||
|
TCADPParser = 'TCADP Parser',
|
||||||
}
|
}
|
||||||
|
|
||||||
export function LayoutRecognizeFormField({
|
export function LayoutRecognizeFormField({
|
||||||
@ -45,6 +46,7 @@ export function LayoutRecognizeFormField({
|
|||||||
ParseDocumentType.PlainText,
|
ParseDocumentType.PlainText,
|
||||||
ParseDocumentType.MinerU,
|
ParseDocumentType.MinerU,
|
||||||
ParseDocumentType.Docling,
|
ParseDocumentType.Docling,
|
||||||
|
ParseDocumentType.TCADPParser,
|
||||||
].map((x) => ({
|
].map((x) => ({
|
||||||
label: x === ParseDocumentType.PlainText ? t(camelCase(x)) : x,
|
label: x === ParseDocumentType.PlainText ? t(camelCase(x)) : x,
|
||||||
value: x,
|
value: x,
|
||||||
|
|||||||
@ -20,7 +20,8 @@ export function PdfFormFields({ prefix }: CommonProps) {
|
|||||||
return (
|
return (
|
||||||
!isEmpty(parseMethod) &&
|
!isEmpty(parseMethod) &&
|
||||||
parseMethod !== ParseDocumentType.DeepDOC &&
|
parseMethod !== ParseDocumentType.DeepDOC &&
|
||||||
parseMethod !== ParseDocumentType.PlainText
|
parseMethod !== ParseDocumentType.PlainText &&
|
||||||
|
parseMethod !== ParseDocumentType.TCADPParser
|
||||||
);
|
);
|
||||||
}, [parseMethod]);
|
}, [parseMethod]);
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user