Mirror of https://github.com/infiniflow/ragflow.git, synced 2026-02-02 08:35:08 +08:00
Fix: zip extraction vulnerabilities in MinerU and TCADP (#12527)
### What problem does this PR solve?

Fix zip extraction vulnerabilities:

- Block symlink entries in zip files.
- Reject encrypted zip entries.
- Prevent absolute path attacks (including Windows paths).
- Block path traversal attempts (../).
- Stop zip slip exploits (directory escape).
- Use streaming for memory-safe file handling.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
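The attack shapes listed above are easy to reproduce with the standard `zipfile` module. The sketch below is illustrative only and not part of the change set; it builds an archive containing a path-traversal entry and a symlink entry, the two member types the hardened extractors now reject (the file name `evil.zip` and the helper name are arbitrary).

```python
import stat
import zipfile


def build_malicious_zip(path: str = "evil.zip") -> str:
    """Craft a zip that a hardened extractor should refuse to unpack."""
    with zipfile.ZipFile(path, "w") as zf:
        # Path traversal: the entry name tries to climb out of the extraction dir.
        zf.writestr("../../outside.txt", b"escaped")

        # Symlink entry: the Unix mode sits in the high 16 bits of external_attr,
        # and the link target is stored as the member's data.
        info = zipfile.ZipInfo("link_to_passwd")
        info.external_attr = (stat.S_IFLNK | 0o777) << 16
        zf.writestr(info, "/etc/passwd")
    return path
```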
@@ -17,6 +17,7 @@ import json
 import logging
 import os
 import re
+import shutil
 import sys
 import tempfile
 import threading
@@ -138,39 +139,58 @@ class MinerUParser(RAGFlowPdfParser):
         self.outlines = []
         self.logger = logging.getLogger(self.__class__.__name__)
 
+    @staticmethod
+    def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
+        return (member.external_attr >> 16) & 0o170000 == 0o120000
+
     def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
         self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
+        base_dir = Path(extract_to).resolve()
         with zipfile.ZipFile(zip_path, "r") as zip_ref:
+            members = zip_ref.infolist()
             if not root_dir:
-                files = zip_ref.namelist()
-                if files and files[0].endswith("/"):
-                    root_dir = files[0]
+                if members and members[0].filename.endswith("/"):
+                    root_dir = members[0].filename
                 else:
                     root_dir = None
+            if root_dir:
+                root_dir = root_dir.replace("\\", "/")
+                if not root_dir.endswith("/"):
+                    root_dir += "/"
 
-            if not root_dir or not root_dir.endswith("/"):
-                self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})")
-                zip_ref.extractall(extract_to)
-                return
-            root_len = len(root_dir)
-            for member in zip_ref.infolist():
-                filename = member.filename
-                if filename == root_dir:
+            for member in members:
+                if member.flag_bits & 0x1:
+                    raise RuntimeError(f"[MinerU] Encrypted zip entry not supported: {member.filename}")
+                if self._is_zipinfo_symlink(member):
+                    raise RuntimeError(f"[MinerU] Symlink zip entry not supported: {member.filename}")
+
+                name = member.filename.replace("\\", "/")
+                if root_dir and name == root_dir:
                     self.logger.info("[MinerU] Ignore root folder...")
                     continue
+                if root_dir and name.startswith(root_dir):
+                    name = name[len(root_dir) :]
+                if not name:
+                    continue
+                if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
+                    raise RuntimeError(f"[MinerU] Unsafe zip path (absolute): {member.filename}")
 
-                path = filename
-                if path.startswith(root_dir):
-                    path = path[root_len:]
-
-                full_path = os.path.join(extract_to, path)
+                parts = [p for p in name.split("/") if p not in ("", ".")]
+                if any(p == ".." for p in parts):
+                    raise RuntimeError(f"[MinerU] Unsafe zip path (traversal): {member.filename}")
+
+                rel_path = os.path.join(*parts) if parts else ""
+                dest_path = (Path(extract_to) / rel_path).resolve(strict=False)
+                if dest_path != base_dir and base_dir not in dest_path.parents:
+                    raise RuntimeError(f"[MinerU] Unsafe zip path (escape): {member.filename}")
 
                 if member.is_dir():
-                    os.makedirs(full_path, exist_ok=True)
-                else:
-                    os.makedirs(os.path.dirname(full_path), exist_ok=True)
-                    with open(full_path, "wb") as f:
-                        f.write(zip_ref.read(filename))
+                    os.makedirs(dest_path, exist_ok=True)
+                    continue
+
+                os.makedirs(dest_path.parent, exist_ok=True)
+                with zip_ref.open(member) as src, open(dest_path, "wb") as dst:
+                    shutil.copyfileobj(src, dst)
 
     @staticmethod
     def _is_http_endpoint_valid(url, timeout=5):
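A note on the two key checks introduced above. The symlink test relies on the fact that, for entries written on Unix, the high 16 bits of `ZipInfo.external_attr` carry the file mode, so masking with `0o170000` (`stat.S_IFMT`) and comparing against `0o120000` (`stat.S_IFLNK`) is equivalent to `stat.S_ISLNK`. The escape test resolves the destination path and requires it to stay under the extraction root. A minimal standalone sketch of both checks (the function names are illustrative, not from the PR):

```python
import stat
import zipfile
from pathlib import Path


def is_symlink_entry(member: zipfile.ZipInfo) -> bool:
    # Same test as the parser's mask-and-compare, written via the stat helpers.
    return stat.S_ISLNK(member.external_attr >> 16)


def stays_inside(extract_to: str, rel_path: str) -> bool:
    # Zip-slip guard: the resolved destination must be the base directory itself
    # or have the base directory among its parents.
    base = Path(extract_to).resolve()
    dest = (base / rel_path).resolve(strict=False)
    return dest == base or base in dest.parents
```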
@@ -237,8 +257,6 @@ class MinerUParser(RAGFlowPdfParser):
             output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
             output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
 
-            files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
-
             data = {
                 "output_dir": "./output",
                 "lang_list": options.lang,
@@ -270,26 +288,35 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
             if callback:
                 callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
-            response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
-                                     timeout=1800)
-
-            response.raise_for_status()
-            if response.headers.get("Content-Type") == "application/zip":
-                self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
-
-                if callback:
-                    callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
-
-                with open(output_zip_path, "wb") as f:
-                    f.write(response.content)
-
-                self.logger.info(f"[MinerU] Unzip to {output_path}...")
-                self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
-
-                if callback:
-                    callback(0.40, f"[MinerU] Unzip to {output_path}...")
-            else:
-                self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
+            with open(pdf_file_path, "rb") as pdf_file:
+                files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")}
+                with requests.post(
+                    url=f"{self.mineru_api}/file_parse",
+                    files=files,
+                    data=data,
+                    headers=headers,
+                    timeout=1800,
+                    stream=True,
+                ) as response:
+                    response.raise_for_status()
+                    content_type = response.headers.get("Content-Type", "")
+                    if content_type.startswith("application/zip"):
+                        self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
+
+                        if callback:
+                            callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
+
+                        with open(output_zip_path, "wb") as f:
+                            response.raw.decode_content = True
+                            shutil.copyfileobj(response.raw, f)
+
+                        self.logger.info(f"[MinerU] Unzip to {output_path}...")
+                        self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
+
+                        if callback:
+                            callback(0.40, f"[MinerU] Unzip to {output_path}...")
+                    else:
+                        self.logger.warning(f"[MinerU] not zip returned from api: {content_type}")
         except Exception as e:
             raise RuntimeError(f"[MinerU] api failed with exception {e}")
         self.logger.info("[MinerU] Api completed successfully.")
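The request change above replaces a fully buffered `response.content` write with `stream=True` plus `shutil.copyfileobj`, so a large result zip is copied to disk in chunks rather than held in memory; setting `response.raw.decode_content = True` lets urllib3 undo any `Content-Encoding` (such as gzip) during the copy. A generic sketch of the same pattern (the helper name and default timeout are illustrative):

```python
import shutil

import requests


def download_to_file(url: str, dest_path: str, timeout: int = 1800) -> str:
    # Stream the body straight to disk instead of buffering it via response.content.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(dest_path, "wb") as f:
            response.raw.decode_content = True  # transparently un-gzip if needed
            shutil.copyfileobj(response.raw, f)
    return dest_path
```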
@@ -17,6 +17,7 @@ import base64
 import json
 import logging
 import os
+import re
 import shutil
 import tempfile
 import time
@@ -48,10 +49,10 @@ class TencentCloudAPIClient:
         self.secret_key = secret_key
         self.region = region
         self.outlines = []
 
         # Create credentials
         self.cred = credential.Credential(secret_id, secret_key)
 
         # Instantiate an http option, optional, can be skipped if no special requirements
         self.httpProfile = HttpProfile()
         self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"
@@ -59,7 +60,7 @@ class TencentCloudAPIClient:
         # Instantiate a client option, optional, can be skipped if no special requirements
         self.clientProfile = ClientProfile()
         self.clientProfile.httpProfile = self.httpProfile
 
         # Instantiate the client object for the product to be requested, clientProfile is optional
         self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)
 
@@ -68,14 +69,14 @@ class TencentCloudAPIClient:
         try:
             # Instantiate a request object, each interface corresponds to a request object
             req = models.ReconstructDocumentSSERequest()
 
             # Build request parameters
             params = {
                 "FileType": file_type,
                 "FileStartPageNumber": file_start_page,
                 "FileEndPageNumber": file_end_page,
             }
 
             # According to Tencent Cloud API documentation, either FileUrl or FileBase64 parameter must be provided, if both are provided only FileUrl will be used
             if file_url:
                 params["FileUrl"] = file_url
@@ -94,7 +95,7 @@ class TencentCloudAPIClient:
             # The returned resp is an instance of ReconstructDocumentSSEResponse, corresponding to the request object
             resp = self.client.ReconstructDocumentSSE(req)
             parser_result = {}
 
             # Output json format string response
             if isinstance(resp, types.GeneratorType):  # Streaming response
                 logging.info("[TCADP] Detected streaming response")
@@ -104,7 +105,7 @@ class TencentCloudAPIClient:
                     try:
                         data_dict = json.loads(event['data'])
                         logging.info(f"[TCADP] Parsed data: {data_dict}")
 
                         if data_dict.get('Progress') == "100":
                             parser_result = data_dict
                             logging.info("[TCADP] Document parsing completed!")
@@ -118,14 +119,14 @@ class TencentCloudAPIClient:
                                 logging.warning("[TCADP] Failed parsing pages:")
                                 for page in failed_pages:
                                     logging.warning(f"[TCADP] Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")
 
                             # Check if there is a download link
                             download_url = data_dict.get("DocumentRecognizeResultUrl")
                             if download_url:
                                 logging.info(f"[TCADP] Got download link: {download_url}")
                             else:
                                 logging.warning("[TCADP] No download link obtained")
 
                             break  # Found final result, exit loop
                         else:
                             # Print progress information
@@ -168,9 +169,6 @@ class TencentCloudAPIClient:
             return None
 
         try:
-            response = requests.get(download_url)
-            response.raise_for_status()
-
             # Ensure output directory exists
             os.makedirs(output_dir, exist_ok=True)
 
@@ -179,29 +177,36 @@ class TencentCloudAPIClient:
             filename = f"tcadp_result_{timestamp}.zip"
             file_path = os.path.join(output_dir, filename)
 
-            # Save file
-            with open(file_path, "wb") as f:
-                f.write(response.content)
+            with requests.get(download_url, stream=True) as response:
+                response.raise_for_status()
+                with open(file_path, "wb") as f:
+                    response.raw.decode_content = True
+                    shutil.copyfileobj(response.raw, f)
 
             logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
             return file_path
 
-        except requests.exceptions.RequestException as e:
+        except Exception as e:
            logging.error(f"[TCADP] Failed to download file: {e}")
+            try:
+                if "file_path" in locals() and os.path.exists(file_path):
+                    os.unlink(file_path)
+            except Exception:
+                pass
             return None
 
 
 class TCADPParser(RAGFlowPdfParser):
     def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
                  table_result_type: str = None, markdown_image_response_type: str = None):
         super().__init__()
 
         # First initialize logger
         self.logger = logging.getLogger(self.__class__.__name__)
 
         # Log received parameters
         self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
 
         # Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
         try:
             tcadp_parser = get_base_config("tcadp_config", {})
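The cleanup branch above removes a partially written file when the download fails, probing `locals()` to find out whether `file_path` was ever assigned. An equivalent arrangement, shown only as a hedged sketch and not what the PR uses, takes the target path as an argument so the probe is unnecessary (the helper name and timeout are illustrative):

```python
import logging
import os
import shutil
from typing import Optional

import requests


def download_result(download_url: str, file_path: str, timeout: int = 60) -> Optional[str]:
    """Stream a result archive to file_path; remove the partial file on failure."""
    try:
        with requests.get(download_url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(file_path, "wb") as f:
                response.raw.decode_content = True
                shutil.copyfileobj(response.raw, f)
        return file_path
    except Exception as e:
        logging.error("Failed to download file: %s", e)
        if os.path.exists(file_path):
            try:
                os.unlink(file_path)  # best-effort removal of the partial download
            except OSError:
                pass
        return None
```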
@@ -212,7 +217,7 @@ class TCADPParser(RAGFlowPdfParser):
                 # Set table_result_type and markdown_image_response_type from config or parameters
                 self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
                 self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
 
             else:
                 self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
                 # If config file is empty, use provided parameters or defaults
@@ -237,6 +242,10 @@ class TCADPParser(RAGFlowPdfParser):
         if not self.secret_id or not self.secret_key:
             raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
 
+    @staticmethod
+    def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
+        return (member.external_attr >> 16) & 0o170000 == 0o120000
+
     def check_installation(self) -> bool:
         """Check if Tencent Cloud API configuration is correct"""
         try:
@@ -255,7 +264,7 @@ class TCADPParser(RAGFlowPdfParser):
 
     def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
         """Convert file to Base64 format"""
 
         if binary:
             # If binary data is directly available, convert directly
             return base64.b64encode(binary).decode('utf-8')
@@ -271,23 +280,34 @@ class TCADPParser(RAGFlowPdfParser):
 
         try:
             with zipfile.ZipFile(zip_path, "r") as zip_file:
-                # Find JSON result files
-                json_files = [f for f in zip_file.namelist() if f.endswith(".json")]
-
-                for json_file in json_files:
-                    with zip_file.open(json_file) as f:
-                        data = json.load(f)
-                        if isinstance(data, list):
-                            results.extend(data)
-                        else:
-                            results.append(data)
-
-                # Find Markdown files
-                md_files = [f for f in zip_file.namelist() if f.endswith(".md")]
-                for md_file in md_files:
-                    with zip_file.open(md_file) as f:
-                        content = f.read().decode("utf-8")
-                        results.append({"type": "text", "content": content, "file": md_file})
+                members = zip_file.infolist()
+                for member in members:
+                    name = member.filename.replace("\\", "/")
+                    if member.is_dir():
+                        continue
+                    if member.flag_bits & 0x1:
+                        raise RuntimeError(f"[TCADP] Encrypted zip entry not supported: {member.filename}")
+                    if self._is_zipinfo_symlink(member):
+                        raise RuntimeError(f"[TCADP] Symlink zip entry not supported: {member.filename}")
+                    if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
+                        raise RuntimeError(f"[TCADP] Unsafe zip path (absolute): {member.filename}")
+                    parts = [p for p in name.split("/") if p not in ("", ".")]
+                    if any(p == ".." for p in parts):
+                        raise RuntimeError(f"[TCADP] Unsafe zip path (traversal): {member.filename}")
+
+                    if not (name.endswith(".json") or name.endswith(".md")):
+                        continue
+
+                    with zip_file.open(member) as f:
+                        if name.endswith(".json"):
+                            data = json.load(f)
+                            if isinstance(data, list):
+                                results.extend(data)
+                            else:
+                                results.append(data)
+                        else:
+                            content = f.read().decode("utf-8")
+                            results.append({"type": "text", "content": content, "file": name})
 
         except Exception as e:
             self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")
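On the encryption check used in both parsers: bit 0 of a member's general-purpose flag (`ZipInfo.flag_bits & 0x1`) marks an encrypted entry, which `ZipFile.open()` would otherwise reject later with a password error, so failing fast gives a clearer message. Because this method only needs `.json` and `.md` members, the reads can stay entirely in memory; the condensed sketch below illustrates that idea under the assumption that the path and symlink checks from the diff are applied as well (the function name is hypothetical):

```python
import json
import zipfile


def read_result_members(zip_path: str) -> list:
    """Pull .json/.md members straight from the archive; nothing is written to disk."""
    results = []
    with zipfile.ZipFile(zip_path, "r") as zf:
        for member in zf.infolist():
            # Skip directories and encrypted entries (general-purpose flag bit 0);
            # a hardened caller would also reject symlinks and unsafe paths first.
            if member.is_dir() or member.flag_bits & 0x1:
                continue
            name = member.filename
            if name.endswith(".json"):
                results.append(json.loads(zf.read(member)))
            elif name.endswith(".md"):
                results.append({"type": "text", "content": zf.read(member).decode("utf-8"), "file": name})
    return results
```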
@@ -395,7 +415,7 @@ class TCADPParser(RAGFlowPdfParser):
         # Convert file to Base64 format
         if callback:
             callback(0.2, "[TCADP] Converting file to Base64 format")
 
         file_base64 = self._file_to_base64(file_path, binary)
         if callback:
             callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")
@@ -420,23 +440,23 @@ class TCADPParser(RAGFlowPdfParser):
                     "TableResultType": self.table_result_type,
                     "MarkdownImageResponseType": self.markdown_image_response_type
                 }
 
                 self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")
 
                 result = client.reconstruct_document_sse(
                     file_type=file_type,
                     file_base64=file_base64,
                     file_start_page=file_start_page,
                     file_end_page=file_end_page,
                     config=config
                 )
 
                 if result:
                     self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
                     break
                 else:
                     self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")
 
             except Exception as e:
                 self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
                 if attempt == max_retries - 1: