Feat: Add TCADP parser for PPTX and spreadsheet document types. (#11041)

### What problem does this PR solve?

- Added TCADP Parser configuration fields to PDF, PPT, and spreadsheet
parsing forms
- Implemented support for setting table result type (Markdown/HTML) and
Markdown image response type (URL/Text)
- Updated TCADP Parser to take return format settings from explicit
parameters first, falling back to the tcadp_config values in
service_conf.yaml and then to the default "1"
- Enhanced frontend to dynamically show TCADP options based on selected
parsing method
- Modified backend to pass format parameters when calling TCADP API
- Optimized form default value logic for TCADP configuration items
- Updated multilingual resource files for new configuration options

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
aidan
2025-11-20 10:08:42 +08:00
committed by GitHub
parent ecf0322165
commit 420c97199a
18 changed files with 668 additions and 37 deletions

View File

@ -192,12 +192,16 @@ class TencentCloudAPIClient:
class TCADPParser(RAGFlowPdfParser):
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
table_result_type: str = None, markdown_image_response_type: str = None):
super().__init__()
# First initialize logger
self.logger = logging.getLogger(self.__class__.__name__)
# Log received parameters
self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
# Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
try:
tcadp_parser = get_base_config("tcadp_config", {})
@ -205,14 +209,30 @@ class TCADPParser(RAGFlowPdfParser):
self.secret_id = secret_id or tcadp_parser.get("secret_id")
self.secret_key = secret_key or tcadp_parser.get("secret_key")
self.region = region or tcadp_parser.get("region", "ap-guangzhou")
self.table_result_type = tcadp_parser.get("table_result_type", "1")
self.markdown_image_response_type = tcadp_parser.get("markdown_image_response_type", "1")
self.logger.info("[TCADP] Configuration read from service_conf.yaml")
# Set table_result_type and markdown_image_response_type from config or parameters
self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
else:
self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
# If config file is empty, use provided parameters or defaults
self.secret_id = secret_id
self.secret_key = secret_key
self.region = region or "ap-guangzhou"
self.table_result_type = table_result_type if table_result_type is not None else "1"
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
except ImportError:
self.logger.info("[TCADP] Configuration module import failed")
# If config file is not available, use provided parameters or defaults
self.secret_id = secret_id
self.secret_key = secret_key
self.region = region or "ap-guangzhou"
self.table_result_type = table_result_type if table_result_type is not None else "1"
self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else "1"
# Log final values
self.logger.info(f"[TCADP] Final values - table_result_type: {self.table_result_type}, markdown_image_response_type: {self.markdown_image_response_type}")
if not self.secret_id or not self.secret_key:
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@ -400,6 +420,8 @@ class TCADPParser(RAGFlowPdfParser):
"TableResultType": self.table_result_type,
"MarkdownImageResponseType": self.markdown_image_response_type
}
self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
result = client.reconstruct_document_sse(
file_type=file_type,