diff --git a/conf/service_conf.yaml b/conf/service_conf.yaml index 384197bbf..15b7c9d06 100644 --- a/conf/service_conf.yaml +++ b/conf/service_conf.yaml @@ -133,3 +133,9 @@ user_default_llm: # - "RAGFlow" # display name # - "" # sender email address # mail_frontend_url: "https://your-frontend.example.com" +# tcadp_config: +# secret_id: 'tencent_secret_id' +# secret_key: 'tencent_secret_key' +# region: 'tencent_region' +# table_result_type: '1' +# markdown_image_response_type: '1' diff --git a/deepdoc/parser/tcadp_parser.py b/deepdoc/parser/tcadp_parser.py new file mode 100644 index 000000000..f84a0e6a4 --- /dev/null +++ b/deepdoc/parser/tcadp_parser.py @@ -0,0 +1,504 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+import base64
+import json
+import logging
+import os
+import shutil
+import tempfile
+import time
+import traceback
+import types
+import zipfile
+from datetime import datetime
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from typing import Any, Callable, Optional
+
+import requests
+from tencentcloud.common import credential
+from tencentcloud.common.profile.client_profile import ClientProfile
+from tencentcloud.common.profile.http_profile import HttpProfile
+from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
+from tencentcloud.lkeap.v20240522 import lkeap_client, models
+
+from api.utils.configs import get_base_config
+from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+
+# Region used when neither the caller nor tcadp_config supplies one.
+_DEFAULT_REGION = "ap-guangzhou"
+# Seconds to wait on the result-archive download before giving up.
+_DOWNLOAD_TIMEOUT = 60
+
+
+class TencentCloudAPIClient:
+    """Tencent Cloud API client built on the official SDK ``LkeapClient``."""
+
+    def __init__(self, secret_id, secret_key, region):
+        """Create the SDK credential, HTTP/client profiles and the LKEAP client.
+
+        Args:
+            secret_id: Tencent Cloud secret id passed to ``credential.Credential``.
+            secret_key: Tencent Cloud secret key passed to ``credential.Credential``.
+            region: Region string passed to ``LkeapClient``.
+        """
+        self.secret_id = secret_id
+        self.secret_key = secret_key
+        self.region = region
+
+        # Create credentials
+        self.cred = credential.Credential(secret_id, secret_key)
+
+        # HTTP profile pinned to the LKEAP endpoint
+        self.httpProfile = HttpProfile()
+        self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"
+
+        # Client profile carrying the HTTP profile above
+        self.clientProfile = ClientProfile()
+        self.clientProfile.httpProfile = self.httpProfile
+
+        # Client object for the product to be requested
+        self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)
+
+    def reconstruct_document_sse(self, file_type, file_url=None, file_base64=None, file_start_page=1, file_end_page=1000, config=None):
+        """Call the ReconstructDocumentSSE API and return the final result payload.
+
+        Args:
+            file_type: Value sent as ``FileType`` (callers in this file pass ``"PDF"``).
+            file_url: Document URL; when truthy it is sent and ``file_base64`` is ignored.
+            file_base64: Base64-encoded document, sent only when ``file_url`` is falsy.
+            file_start_page: Value sent as ``FileStartPageNumber``.
+            file_end_page: Value sent as ``FileEndPageNumber``.
+            config: Optional dict sent as ``Config``.
+
+        Returns:
+            For a streaming response, the decoded JSON dict of the first event
+            whose ``Progress`` equals 100, or an empty dict if the stream ends
+            without one. For a non-streaming response, the decoded ``resp.data``.
+            ``None`` on any error (SDK error, missing input, undecodable body);
+            errors are logged, never raised.
+        """
+        try:
+            # Each interface corresponds to a request object
+            req = models.ReconstructDocumentSSERequest()
+
+            # Build request parameters
+            params = {
+                "FileType": file_type,
+                "FileStartPageNumber": file_start_page,
+                "FileEndPageNumber": file_end_page,
+            }
+
+            # Either FileUrl or FileBase64 must be provided; FileUrl wins when both are given
+            if file_url:
+                params["FileUrl"] = file_url
+                logging.info(f"[TCADP] Using file URL: {file_url}")
+            elif file_base64:
+                params["FileBase64"] = file_base64
+                logging.info(f"[TCADP] Using Base64 data, length: {len(file_base64)} characters")
+            else:
+                # Caught by the generic handler below, so the caller sees None
+                raise ValueError("Must provide either FileUrl or FileBase64 parameter")
+
+            if config:
+                params["Config"] = config
+
+            req.from_json_string(json.dumps(params))
+
+            resp = self.client.ReconstructDocumentSSE(req)
+            parser_result = {}
+
+            if isinstance(resp, types.GeneratorType):  # Streaming response
+                logging.info("[TCADP] Detected streaming response")
+                for event in resp:
+                    logging.info(f"[TCADP] Received event: {event}")
+                    if event.get('data'):
+                        try:
+                            data_dict = json.loads(event['data'])
+                            logging.info(f"[TCADP] Parsed data: {data_dict}")
+
+                            # str() so that both "100" and a numeric 100 mark completion
+                            if str(data_dict.get('Progress')) == "100":
+                                parser_result = data_dict
+                                logging.info("[TCADP] Document parsing completed!")
+                                logging.info(f"[TCADP] Task ID: {data_dict.get('TaskId')}")
+                                logging.info(f"[TCADP] Success pages: {data_dict.get('SuccessPageNum')}")
+                                logging.info(f"[TCADP] Failed pages: {data_dict.get('FailPageNum')}")
+
+                                # Report failed page information
+                                failed_pages = data_dict.get("FailedPages", [])
+                                if failed_pages:
+                                    logging.warning("[TCADP] Failed parsing pages:")
+                                    for page in failed_pages:
+                                        logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")
+
+                                # Check if there is a download link
+                                download_url = data_dict.get("DocumentRecognizeResultUrl")
+                                if download_url:
+                                    logging.info(f"[TCADP] Got download link: {download_url}")
+                                else:
+                                    logging.warning("[TCADP] No download link obtained")
+
+                                break  # Found final result, exit loop
+                            else:
+                                # Intermediate progress event
+                                progress = data_dict.get("Progress", "0")
+                                logging.info(f"[TCADP] Progress: {progress}%")
+                        except json.JSONDecodeError as e:
+                            # A single undecodable event does not abort the stream
+                            logging.error(f"[TCADP] Failed to parse JSON data: {e}")
+                            logging.error(f"[TCADP] Raw data: {event.get('data')}")
+                            continue
+                    else:
+                        logging.info(f"[TCADP] Event without data: {event}")
+            else:  # Non-streaming response
+                logging.info("[TCADP] Detected non-streaming response")
+                if hasattr(resp, 'data') and resp.data:
+                    try:
+                        data_dict = json.loads(resp.data)
+                        parser_result = data_dict
+                        logging.info(f"[TCADP] JSON parsing successful: {parser_result}")
+                    except json.JSONDecodeError as e:
+                        logging.error(f"[TCADP] JSON parsing failed: {e}")
+                        return None
+                else:
+                    logging.error("[TCADP] No data in response")
+                    return None
+
+            return parser_result
+
+        except TencentCloudSDKException as err:
+            logging.error(f"[TCADP] Tencent Cloud SDK error: {err}")
+            return None
+        except Exception as e:
+            logging.error(f"[TCADP] Unknown error: {e}")
+            logging.error(f"[TCADP] Error stack trace: {traceback.format_exc()}")
+            return None
+
+    def download_result_file(self, download_url, output_dir, timeout=_DOWNLOAD_TIMEOUT):
+        """Download the parsing result archive into ``output_dir``.
+
+        Args:
+            download_url: URL of the result archive; a falsy value returns ``None``.
+            output_dir: Directory to write into; created if it does not exist.
+            timeout: Seconds passed to ``requests.get`` so a stalled server
+                cannot block the caller indefinitely.
+
+        Returns:
+            Path of the saved ``tcadp_result_<timestamp>.zip`` file, or ``None``
+            when there is no URL or the HTTP request fails.
+        """
+        if not download_url:
+            logging.warning("[TCADP] No downloadable result file")
+            return None
+
+        try:
+            response = requests.get(download_url, timeout=timeout)
+            response.raise_for_status()
+
+            # Ensure output directory exists
+            os.makedirs(output_dir, exist_ok=True)
+
+            # Microseconds keep two downloads started in the same second from
+            # overwriting each other when they share an output directory.
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+            filename = f"tcadp_result_{timestamp}.zip"
+            file_path = os.path.join(output_dir, filename)
+
+            # Save file
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+
+            logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
+            return file_path
+
+        except requests.exceptions.RequestException as e:
+            logging.error(f"[TCADP] Failed to download file: {e}")
+            return None
+
+
+class TCADPParser(RAGFlowPdfParser):
+    """PDF parser that delegates document parsing to the Tencent Cloud API."""
+
+    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou"):
+        """Resolve credentials and options from arguments and ``tcadp_config``.
+
+        Explicit ``secret_id`` / ``secret_key`` arguments win over the values in
+        ``tcadp_config`` (service_conf.yaml). ``region`` wins only when it differs
+        from the built-in default; otherwise the configured region is used, then
+        the default.
+
+        Raises:
+            ValueError: If no secret id or secret key could be resolved.
+        """
+        super().__init__()
+
+        # First initialize logger
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Assign every attribute up front so a missing tcadp_config ends in the
+        # ValueError below rather than an AttributeError.
+        self.secret_id = secret_id
+        self.secret_key = secret_key
+        self.region = region or _DEFAULT_REGION
+        self.table_result_type = "1"
+        self.markdown_image_response_type = "1"
+
+        # Read configuration from RAGFlow configuration system (service_conf.yaml)
+        try:
+            tcadp_config = get_base_config("tcadp_config", {})
+            if isinstance(tcadp_config, dict) and tcadp_config:
+                self.secret_id = secret_id or tcadp_config.get("secret_id")
+                self.secret_key = secret_key or tcadp_config.get("secret_key")
+                # The parameter default is always truthy, so `region or config`
+                # would never consult the configured region.
+                if not region or region == _DEFAULT_REGION:
+                    self.region = tcadp_config.get("region") or _DEFAULT_REGION
+                self.table_result_type = tcadp_config.get("table_result_type", "1")
+                self.markdown_image_response_type = tcadp_config.get("markdown_image_response_type", "1")
+                self.logger.info("[TCADP] Configuration read from service_conf.yaml")
+            else:
+                self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
+
+        except ImportError:
+            self.logger.info("[TCADP] Configuration module import failed")
+
+        if not self.secret_id or not self.secret_key:
+            raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
+
+    def check_installation(self) -> bool:
+        """Return True when credentials are set and an API client can be built."""
+        try:
+            # Check necessary configuration parameters
+            if not self.secret_id or not self.secret_key:
+                self.logger.error("[TCADP] Tencent Cloud API configuration incomplete")
+                return False
+
+            # Try to create client to verify configuration
+            TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)
+            self.logger.info("[TCADP] Tencent Cloud API configuration check passed")
+            return True
+        except Exception as e:
+            self.logger.error(f"[TCADP] Tencent Cloud API configuration check failed: {e}")
+            return False
+
+    def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
+        """Return the Base64 text of ``binary`` if given, else of the file at ``file_path``."""
+        # parse_pdf's signature allows a BytesIO; b64encode needs bytes.
+        if isinstance(binary, BytesIO):
+            binary = binary.getvalue()
+
+        if binary:
+            # Binary data is directly available, convert directly
+            return base64.b64encode(binary).decode('utf-8')
+
+        # Read from file path and convert
+        with open(file_path, 'rb') as f:
+            file_data = f.read()
+        return base64.b64encode(file_data).decode('utf-8')
+
+    def _extract_content_from_zip(self, zip_path: str) -> list[dict[str, Any]]:
+        """Collect content blocks from the ``.json`` and ``.md`` members of a ZIP.
+
+        JSON lists are flattened into the result and other JSON values are
+        appended as-is. Each Markdown file becomes one
+        ``{"type": "text", "content": ..., "file": ...}`` entry. Any failure is
+        logged and whatever was collected so far is returned.
+        """
+        results = []
+
+        try:
+            with zipfile.ZipFile(zip_path, "r") as zip_file:
+                # Find JSON result files
+                json_files = [f for f in zip_file.namelist() if f.endswith(".json")]
+
+                for json_file in json_files:
+                    with zip_file.open(json_file) as f:
+                        data = json.load(f)
+                        if isinstance(data, list):
+                            results.extend(data)
+                        else:
+                            results.append(data)
+
+                # Find Markdown files
+                md_files = [f for f in zip_file.namelist() if f.endswith(".md")]
+                for md_file in md_files:
+                    with zip_file.open(md_file) as f:
+                        content = f.read().decode("utf-8")
+                        results.append({"type": "text", "content": content, "file": md_file})
+
+        except Exception as e:
+            self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")
+
+        return results
+
+    def _parse_content_to_sections(self, content_data: list[dict[str, Any]]) -> list[tuple[str, str]]:
+        """Convert content blocks to ``(text, position_tag)`` sections.
+
+        Items that are not dicts or whose ``content`` is empty are skipped. The
+        position tag is a fixed placeholder; no real coordinates are derived.
+        """
+        sections = []
+
+        for item in content_data:
+            # JSON members of the archive may hold arbitrary values; only dicts
+            # carry the fields read below.
+            if not isinstance(item, dict):
+                continue
+
+            content_type = item.get("type", "text")
+            content = item.get("content", "")
+
+            if not content:
+                continue
+
+            # Process based on content type
+            if content_type == "text" or content_type == "paragraph":
+                section_text = content
+            elif content_type == "table":
+                table_data = item.get("table_data", {})
+                if isinstance(table_data, dict):
+                    # Flatten rows to text; str() because cells need not be strings
+                    rows = table_data.get("rows", [])
+                    section_text = "\n".join(" | ".join(str(cell) for cell in row) for row in rows)
+                else:
+                    section_text = str(table_data)
+            elif content_type == "image":
+                caption = item.get("caption", "")
+                section_text = f"[Image] {caption}" if caption else "[Image]"
+            elif content_type == "equation":
+                section_text = f"$${content}$$"
+            else:
+                section_text = content
+
+            # Non-string content would otherwise fail on .strip()
+            if not isinstance(section_text, str):
+                section_text = str(section_text)
+
+            if section_text.strip():
+                # Generate position tag (simplified version)
+                position_tag = "@@1\t0.0\t1000.0\t0.0\t100.0##"
+                sections.append((section_text, position_tag))
+
+        return sections
+
+    def _parse_content_to_tables(self, content_data: list[dict[str, Any]]) -> list:
+        """Render every ``table`` block with non-empty rows as an HTML table string.
+
+        The first row is emitted with ``<th>`` cells and later rows with ``<td>``.
+        """
+        tables = []
+
+        for item in content_data:
+            if not isinstance(item, dict) or item.get("type") != "table":
+                continue
+
+            table_data = item.get("table_data", {})
+            if not isinstance(table_data, dict):
+                continue
+
+            rows = table_data.get("rows", [])
+            if not rows:
+                continue
+
+            # Convert to table format
+            table_html = "<table>\n"
+            for i, row in enumerate(rows):
+                tag = "th" if i == 0 else "td"
+                table_html += "  <tr>\n"
+                for cell in row:
+                    table_html += f"    <{tag}>{cell}</{tag}>\n"
+                table_html += "  </tr>\n"
+            table_html += "</table>"
+            tables.append(table_html)
+
+        return tables
+
+    def parse_pdf(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes,
+        callback: Optional[Callable] = None,
+        *,
+        output_dir: Optional[str] = None,
+        file_type: str = "PDF",
+        file_start_page: Optional[int] = 1,
+        file_end_page: Optional[int] = 1000,
+        delete_output: Optional[bool] = True,
+        max_retries: Optional[int] = 1,
+    ) -> tuple:
+        """Parse a document through the Tencent Cloud API.
+
+        Args:
+            filepath: Path of the document; read only when ``binary`` is empty.
+            binary: Document content as bytes or ``BytesIO``.
+            callback: Optional ``callback(progress, message)``; called with ``-1``
+                before a failure is raised.
+            output_dir: Directory for the downloaded archive; a temporary
+                directory is created when falsy.
+            file_type: Forwarded as the API ``FileType``.
+            file_start_page: Forwarded as ``FileStartPageNumber``.
+            file_end_page: Forwarded as ``FileEndPageNumber``.
+            delete_output: Remove the temporary directory created here (a
+                caller-supplied ``output_dir`` is never removed).
+            max_retries: Number of API attempts; values below 1 or ``None``
+                are treated as 1.
+
+        Returns:
+            ``(sections, tables)`` as produced by ``_parse_content_to_sections``
+            and ``_parse_content_to_tables``.
+
+        Raises:
+            FileNotFoundError: No ``binary`` given and ``filepath`` does not exist.
+            RuntimeError: The API returned no result, no download link, or the
+                download failed.
+        """
+
+        temp_file = None
+        out_dir = None
+        created_tmp_dir = False
+
+        # The signature allows BytesIO, but file.write and b64encode need bytes.
+        if isinstance(binary, BytesIO):
+            binary = binary.getvalue()
+
+        # range(None) raises and range(0) would skip the API call entirely.
+        attempts = max(1, int(max_retries or 1))
+
+        try:
+            # Handle input file
+            if binary:
+                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
+                temp_file.write(binary)
+                temp_file.close()
+                file_path = temp_file.name
+                self.logger.info(f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
+                if callback:
+                    callback(0.1, f"[TCADP] Received binary PDF -> {os.path.basename(file_path)}")
+            else:
+                file_path = str(filepath)
+                if not os.path.exists(file_path):
+                    if callback:
+                        callback(-1, f"[TCADP] PDF file does not exist: {file_path}")
+                    raise FileNotFoundError(f"[TCADP] PDF file does not exist: {file_path}")
+
+            # Convert file to Base64 format
+            if callback:
+                callback(0.2, "[TCADP] Converting file to Base64 format")
+
+            file_base64 = self._file_to_base64(file_path, binary)
+            if callback:
+                callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")
+
+            # Create Tencent Cloud API client
+            client = TencentCloudAPIClient(self.secret_id, self.secret_key, self.region)
+
+            # Call document parsing API (with retry mechanism)
+            if callback:
+                callback(0.3, "[TCADP] Starting to call Tencent Cloud document parsing API")
+
+            # Identical for every attempt, so build it once
+            config = {
+                "TableResultType": self.table_result_type,
+                "MarkdownImageResponseType": self.markdown_image_response_type
+            }
+
+            result = None
+            for attempt in range(attempts):
+                try:
+                    if attempt > 0:
+                        self.logger.info(f"[TCADP] Retry attempt {attempt + 1}")
+                        if callback:
+                            # Capped so retries never report past the 0.6 download step
+                            callback(min(0.3 + attempt * 0.1, 0.55), f"[TCADP] Retry attempt {attempt + 1}")
+                        time.sleep(2 ** attempt)  # Exponential backoff
+
+                    result = client.reconstruct_document_sse(
+                        file_type=file_type,
+                        file_base64=file_base64,
+                        file_start_page=file_start_page,
+                        file_end_page=file_end_page,
+                        config=config
+                    )
+
+                    if result:
+                        self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
+                        break
+                    else:
+                        self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")
+
+                except Exception as e:
+                    self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
+                    if attempt == attempts - 1:
+                        raise
+
+            if not result:
+                error_msg = f"[TCADP] Document parsing failed, retried {attempts} times"
+                self.logger.error(error_msg)
+                if callback:
+                    callback(-1, error_msg)
+                raise RuntimeError(error_msg)
+
+            # Get download link
+            download_url = result.get("DocumentRecognizeResultUrl")
+            if not download_url:
+                if callback:
+                    callback(-1, "[TCADP] No parsing result download link obtained")
+                raise RuntimeError("[TCADP] No parsing result download link obtained")
+
+            if callback:
+                callback(0.6, f"[TCADP] Parsing result download link: {download_url}")
+
+            # Set output directory
+            if output_dir:
+                out_dir = Path(output_dir)
+                out_dir.mkdir(parents=True, exist_ok=True)
+            else:
+                out_dir = Path(tempfile.mkdtemp(prefix="adp_pdf_"))
+                created_tmp_dir = True
+
+            # Download result file
+            zip_path = client.download_result_file(download_url, str(out_dir))
+            if not zip_path:
+                if callback:
+                    callback(-1, "[TCADP] Failed to download parsing result")
+                raise RuntimeError("[TCADP] Failed to download parsing result")
+
+            if callback:
+                # Shorten file path display, only show filename
+                zip_filename = os.path.basename(zip_path)
+                callback(0.8, f"[TCADP] Parsing result downloaded: {zip_filename}")
+
+            # Extract ZIP file content
+            content_data = self._extract_content_from_zip(zip_path)
+            self.logger.info(f"[TCADP] Extracted {len(content_data)} content blocks")
+
+            if callback:
+                callback(0.9, f"[TCADP] Extracted {len(content_data)} content blocks")
+
+            # Convert to sections and tables format
+            sections = self._parse_content_to_sections(content_data)
+            tables = self._parse_content_to_tables(content_data)
+
+            self.logger.info(f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")
+
+            if callback:
+                callback(1.0, f"[TCADP] Parsing completed: {len(sections)} sections, {len(tables)} tables")
+
+            return sections, tables
+
+        finally:
+            # Cleanup is best-effort: a failure here must not mask the real
+            # outcome, but it is logged rather than silently dropped.
+            if temp_file and os.path.exists(temp_file.name):
+                try:
+                    os.unlink(temp_file.name)
+                except OSError as e:
+                    self.logger.warning(f"[TCADP] Failed to remove temporary file: {e}")
+
+            if delete_output and created_tmp_dir and out_dir is not None and out_dir.exists():
+                try:
+                    shutil.rmtree(out_dir)
+                except OSError as e:
+                    self.logger.warning(f"[TCADP] Failed to remove temporary directory: {e}")
+
+
+if __name__ == "__main__":
+    # Test ADP parser
+    parser = TCADPParser()
+    print("ADP available:", parser.check_installation())
+
+    # Test parsing
+    filepath = ""
+    if filepath and os.path.exists(filepath):
+        with open(filepath, "rb") as file:
+            sections, tables = parser.parse_pdf(filepath=filepath, binary=file.read())
+        print(f"Parsing result: {len(sections)} sections, {len(tables)} tables")
+        for i, (section, tag) in enumerate(sections[:3]):  # Only print first 3
+            print(f"Section {i + 1}: {section[:100]}...")
diff --git a/docker/service_conf.yaml.template b/docker/service_conf.yaml.template
index f49ab4f67..dbefe053a 100644
--- a/docker/service_conf.yaml.template
+++ b/docker/service_conf.yaml.template
@@ -138,3 +138,9 @@ user_default_llm:
 # - "RAGFlow" # display name
 # - "" # sender email address
 # mail_frontend_url: "https://your-frontend.example.com"
+# tcadp_config:
+# secret_id: '${TENCENT_SECRET_ID}'
+# secret_key: '${TENCENT_SECRET_KEY}'
+# region: '${TENCENT_REGION}'
+# table_result_type: '1'
+# markdown_image_response_type: '1'
diff --git a/pyproject.toml b/pyproject.toml
index 1edbeccd0..f386cfe88 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -98,7 +98,7 @@ dependencies = [
     "strenum==0.4.15",
     "tabulate==0.9.0",
     "tavily-python==0.5.1",
-    "tencentcloud-sdk-python==3.0.1215",
+
"tencentcloud-sdk-python==3.0.1478", "tika==2.6.0", "tiktoken==0.7.0", "umap_learn==0.5.6", diff --git a/rag/app/naive.py b/rag/app/naive.py index 7497a04e1..29eef53b4 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -36,6 +36,7 @@ from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.docling_parser import DoclingParser +from deepdoc.parser.tcadp_parser import TCADPParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table @@ -550,7 +551,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") - + + + elif layout_recognizer == "TCADP Parser": + tcadp_parser = TCADPParser() + if not tcadp_parser.check_installation(): + callback(-1, "TCADP parser not available. 
Please check Tencent Cloud API configuration.") + return res + + sections, tables = tcadp_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), + file_type="PDF" + ) + parser_config["chunk_token_num"] = 0 + callback(0.8, "Finish parsing.") else: if layout_recognizer == "Plain Text": pdf_parser = PlainParser() diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index bff05d084..1212bd38f 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -31,6 +31,7 @@ from api.utils.base64_image import image2id from deepdoc.parser import ExcelParser from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser +from deepdoc.parser.tcadp_parser import TCADPParser from rag.app.naive import Docx from rag.flow.base import ProcessBase, ProcessParamBase from rag.flow.parser.schema import ParserFromUpstream @@ -74,7 +75,7 @@ class ParserParam(ProcessParamBase): self.setups = { "pdf": { - "parse_method": "deepdoc", # deepdoc/plain_text/vlm + "parse_method": "deepdoc", # deepdoc/plain_text/tcadp_parser/vlm "lang": "Chinese", "suffix": [ "pdf", @@ -157,7 +158,7 @@ class ParserParam(ProcessParamBase): pdf_parse_method = pdf_config.get("parse_method", "") self.check_empty(pdf_parse_method, "Parse method abnormal.") - if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru"]: + if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]: self.check_empty(pdf_config.get("lang", ""), "PDF VLM language") pdf_output_format = pdf_config.get("output_format", "") @@ -240,6 +241,39 @@ class Parser(ProcessBase): "text": t, } bboxes.append(box) + elif conf.get("parse_method").lower() == "tcadp parser": + # ADP is a document parsing tool using Tencent Cloud API + tcadp_parser = TCADPParser() + sections, _ = tcadp_parser.parse_pdf( + filepath=name, + binary=blob, + 
callback=self.callback, + file_type="PDF", + file_start_page=1, + file_end_page=1000 + ) + bboxes = [] + for section, position_tag in sections: + if position_tag: + # Extract position information from TCADP's position tag + # Format: @@{page_number}\t{x0}\t{x1}\t{top}\t{bottom}## + import re + match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) + if match: + pn, x0, x1, top, bott = match.groups() + bboxes.append({ + "page_number": int(pn.split('-')[0]), # Take the first page number + "x0": float(x0), + "x1": float(x1), + "top": float(top), + "bottom": float(bott), + "text": section + }) + else: + # If no position info, add as text without position + bboxes.append({"text": section}) + else: + bboxes.append({"text": section}) else: vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang")) lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback) diff --git a/uv.lock b/uv.lock index 7bc027e67..79d7e4527 100644 --- a/uv.lock +++ b/uv.lock @@ -5450,7 +5450,7 @@ requires-dist = [ { name = "strenum", specifier = "==0.4.15" }, { name = "tabulate", specifier = "==0.9.0" }, { name = "tavily-python", specifier = "==0.5.1" }, - { name = "tencentcloud-sdk-python", specifier = "==3.0.1215" }, + { name = "tencentcloud-sdk-python", specifier = "==3.0.1478" }, { name = "tika", specifier = "==2.6.0" }, { name = "tiktoken", specifier = "==0.7.0" }, { name = "trio", specifier = ">=0.29.0" }, @@ -6508,14 +6508,14 @@ wheels = [ [[package]] name = "tencentcloud-sdk-python" -version = "3.0.1215" +version = "3.0.1478" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ { name = "requests" }, ] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/fd/4c/7a320c65d605e817bedd1205c77a612be7d4dde621182cc7c00e334207ce/tencentcloud-sdk-python-3.0.1215.tar.gz", hash = 
"sha256:24441e69d418301d50be0279cb148a747fc272b836e41d18e213750093f490c6", size = 9566281, upload-time = "2024-08-19T20:24:26.541Z" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/3a/47/05163b257f6c0e60aed4272d48bdb816567ab3c805d3e8770430f0cc1be2/tencentcloud-sdk-python-3.0.1478.tar.gz", hash = "sha256:89996462d53a672946aa32d01673a4818ebcd8bc72b024f35ebe96cebe2df179", size = 12297889, upload_time = "2025-10-20T20:54:40.603Z" } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/55/08/98090d1a139e8995053ed22e099b43aa4dea8cffe056f8f0bc5178aeecbd/tencentcloud_sdk_python-3.0.1215-py2.py3-none-any.whl", hash = "sha256:899ced749baf74846f1eabf452f74aa0e48d1965f0ca7828a8b73b446f76f5f2", size = 10265517, upload-time = "2024-08-19T20:24:19.52Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c5/db/daa85799b9af2aa50539b27eeb0d6a2a0ac35465f62683107847830dbe4d/tencentcloud_sdk_python-3.0.1478-py2.py3-none-any.whl", hash = "sha256:10ddee1c1348f49e2b54af606f978d4cb17fca656639e8d99b6527e6e4793833", size = 12984723, upload_time = "2025-10-20T20:54:27.767Z" }, ] [[package]] diff --git a/web/src/components/layout-recognize-form-field.tsx b/web/src/components/layout-recognize-form-field.tsx index fb4935f33..43f0abccb 100644 --- a/web/src/components/layout-recognize-form-field.tsx +++ b/web/src/components/layout-recognize-form-field.tsx @@ -19,6 +19,7 @@ export const enum ParseDocumentType { PlainText = 'Plain Text', MinerU = 'MinerU', Docling = 'Docling', + TCADPParser = 'TCADP Parser', } export function LayoutRecognizeFormField({ @@ -45,6 +46,7 @@ export function LayoutRecognizeFormField({ ParseDocumentType.PlainText, ParseDocumentType.MinerU, ParseDocumentType.Docling, + ParseDocumentType.TCADPParser, ].map((x) => ({ label: x === ParseDocumentType.PlainText ? 
t(camelCase(x)) : x, value: x, diff --git a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx index d6c3eb7f4..020032c5c 100644 --- a/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx +++ b/web/src/pages/agent/form/parser-form/pdf-form-fields.tsx @@ -20,7 +20,8 @@ export function PdfFormFields({ prefix }: CommonProps) { return ( !isEmpty(parseMethod) && parseMethod !== ParseDocumentType.DeepDOC && - parseMethod !== ParseDocumentType.PlainText + parseMethod !== ParseDocumentType.PlainText && + parseMethod !== ParseDocumentType.TCADPParser ); }, [parseMethod]); diff --git a/web/src/pages/data-flow/form/parser-form/pdf-form-fields.tsx b/web/src/pages/data-flow/form/parser-form/pdf-form-fields.tsx new file mode 100644 index 000000000..e69de29bb