Fix: zip extraction vulnerabilities in MinerU and TCADP (#12527)

### What problem does this PR solve? Fix zip extraction vulnerabilities: - Block symlink entries in zip files. - Reject encrypted zip entries. - Prevent absolute path attacks (including Windows paths). - Block path traversal attempts (../). - Stop zip slip exploits (directory escape). - Use streaming for memory-safe file handling. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-30 23:26:36 +08:00 · 2026-01-13 12:24:50 +08:00
parent 41c84fd78f
commit 64c75d558e
2 changed files with 130 additions and 83 deletions
--- a/deepdoc/parser/tcadp_parser.py
+++ b/deepdoc/parser/tcadp_parser.py
@ -17,6 +17,7 @@ import base64
 import json
 import logging
 import os
+import re
 import shutil
 import tempfile
 import time
@ -48,10 +49,10 @@ class TencentCloudAPIClient:
        self.secret_key = secret_key
        self.region = region
        self.outlines = []
-        
+
        # Create credentials
        self.cred = credential.Credential(secret_id, secret_key)
-        
+
        # Instantiate an http option, optional, can be skipped if no special requirements
        self.httpProfile = HttpProfile()
        self.httpProfile.endpoint = "lkeap.tencentcloudapi.com"
@ -59,7 +60,7 @@ class TencentCloudAPIClient:
        # Instantiate a client option, optional, can be skipped if no special requirements
        self.clientProfile = ClientProfile()
        self.clientProfile.httpProfile = self.httpProfile
-        
+
        # Instantiate the client object for the product to be requested, clientProfile is optional
        self.client = lkeap_client.LkeapClient(self.cred, region, self.clientProfile)

@ -68,14 +69,14 @@ class TencentCloudAPIClient:
        try:
            # Instantiate a request object, each interface corresponds to a request object
            req = models.ReconstructDocumentSSERequest()
-            
+
            # Build request parameters
            params = {
                "FileType": file_type,
                "FileStartPageNumber": file_start_page,
                "FileEndPageNumber": file_end_page,
            }
-            
+
            # According to Tencent Cloud API documentation, either FileUrl or FileBase64 parameter must be provided, if both are provided only FileUrl will be used
            if file_url:
                params["FileUrl"] = file_url
@ -94,7 +95,7 @@ class TencentCloudAPIClient:
            # The returned resp is an instance of ReconstructDocumentSSEResponse, corresponding to the request object
            resp = self.client.ReconstructDocumentSSE(req)
            parser_result = {}
-            
+
            # Output json format string response
            if isinstance(resp, types.GeneratorType):  # Streaming response
                logging.info("[TCADP] Detected streaming response")
@ -104,7 +105,7 @@ class TencentCloudAPIClient:
                        try:
                            data_dict = json.loads(event['data'])
                            logging.info(f"[TCADP] Parsed data: {data_dict}")
-                            
+
                            if data_dict.get('Progress') == "100":
                                parser_result = data_dict
                                logging.info("[TCADP] Document parsing completed!")
@ -118,14 +119,14 @@ class TencentCloudAPIClient:
                                    logging.warning("[TCADP] Failed parsing pages:")
                                    for page in failed_pages:
                                        logging.warning(f"[TCADP]   Page number: {page.get('PageNumber')}, Error: {page.get('ErrorMsg')}")
-                                
+
                                # Check if there is a download link
                                download_url = data_dict.get("DocumentRecognizeResultUrl")
                                if download_url:
                                    logging.info(f"[TCADP] Got download link: {download_url}")
                                else:
                                    logging.warning("[TCADP] No download link obtained")
-                                
+
                                break  # Found final result, exit loop
                            else:
                                # Print progress information
@ -168,9 +169,6 @@ class TencentCloudAPIClient:
            return None

        try:
-            response = requests.get(download_url)
-            response.raise_for_status()
-
            # Ensure output directory exists
            os.makedirs(output_dir, exist_ok=True)

@ -179,29 +177,36 @@ class TencentCloudAPIClient:
            filename = f"tcadp_result_{timestamp}.zip"
            file_path = os.path.join(output_dir, filename)

-            # Save file
-            with open(file_path, "wb") as f:
-                f.write(response.content)
+            with requests.get(download_url, stream=True) as response:
+                response.raise_for_status()
+                with open(file_path, "wb") as f:
+                    response.raw.decode_content = True
+                    shutil.copyfileobj(response.raw, f)

            logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
            return file_path

-        except requests.exceptions.RequestException as e:
+        except Exception as e:
            logging.error(f"[TCADP] Failed to download file: {e}")
+            try:
+                if "file_path" in locals() and os.path.exists(file_path):
+                    os.unlink(file_path)
+            except Exception:
+                pass
            return None


 class TCADPParser(RAGFlowPdfParser):
-    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou", 
+    def __init__(self, secret_id: str = None, secret_key: str = None, region: str = "ap-guangzhou",
                 table_result_type: str = None, markdown_image_response_type: str = None):
        super().__init__()
-        
+
        # First initialize logger
        self.logger = logging.getLogger(self.__class__.__name__)
-        
+
        # Log received parameters
        self.logger.info(f"[TCADP] Initializing with parameters - table_result_type: {table_result_type}, markdown_image_response_type: {markdown_image_response_type}")
-        
+
        # Priority: read configuration from RAGFlow configuration system (service_conf.yaml)
        try:
            tcadp_parser = get_base_config("tcadp_config", {})
@ -212,7 +217,7 @@ class TCADPParser(RAGFlowPdfParser):
                # Set table_result_type and markdown_image_response_type from config or parameters
                self.table_result_type = table_result_type if table_result_type is not None else tcadp_parser.get("table_result_type", "1")
                self.markdown_image_response_type = markdown_image_response_type if markdown_image_response_type is not None else tcadp_parser.get("markdown_image_response_type", "1")
-                
+
            else:
                self.logger.error("[TCADP] Please configure tcadp_config in service_conf.yaml first")
                # If config file is empty, use provided parameters or defaults
@ -237,6 +242,10 @@ class TCADPParser(RAGFlowPdfParser):
        if not self.secret_id or not self.secret_key:
            raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")

+    @staticmethod
+    def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
+        return (member.external_attr >> 16) & 0o170000 == 0o120000
+
    def check_installation(self) -> bool:
        """Check if Tencent Cloud API configuration is correct"""
        try:
@ -255,7 +264,7 @@ class TCADPParser(RAGFlowPdfParser):

    def _file_to_base64(self, file_path: str, binary: bytes = None) -> str:
        """Convert file to Base64 format"""
-        
+
        if binary:
            # If binary data is directly available, convert directly
            return base64.b64encode(binary).decode('utf-8')
@ -271,23 +280,34 @@ class TCADPParser(RAGFlowPdfParser):

        try:
            with zipfile.ZipFile(zip_path, "r") as zip_file:
-                # Find JSON result files
-                json_files = [f for f in zip_file.namelist() if f.endswith(".json")]
+                members = zip_file.infolist()
+                for member in members:
+                    name = member.filename.replace("\\", "/")
+                    if member.is_dir():
+                        continue
+                    if member.flag_bits & 0x1:
+                        raise RuntimeError(f"[TCADP] Encrypted zip entry not supported: {member.filename}")
+                    if self._is_zipinfo_symlink(member):
+                        raise RuntimeError(f"[TCADP] Symlink zip entry not supported: {member.filename}")
+                    if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
+                        raise RuntimeError(f"[TCADP] Unsafe zip path (absolute): {member.filename}")
+                    parts = [p for p in name.split("/") if p not in ("", ".")]
+                    if any(p == ".." for p in parts):
+                        raise RuntimeError(f"[TCADP] Unsafe zip path (traversal): {member.filename}")

-                for json_file in json_files:
-                    with zip_file.open(json_file) as f:
-                        data = json.load(f)
-                        if isinstance(data, list):
-                            results.extend(data)
+                    if not (name.endswith(".json") or name.endswith(".md")):
+                        continue
+
+                    with zip_file.open(member) as f:
+                        if name.endswith(".json"):
+                            data = json.load(f)
+                            if isinstance(data, list):
+                                results.extend(data)
+                            else:
+                                results.append(data)
                        else:
-                            results.append(data)
-
-                # Find Markdown files
-                md_files = [f for f in zip_file.namelist() if f.endswith(".md")]
-                for md_file in md_files:
-                    with zip_file.open(md_file) as f:
-                        content = f.read().decode("utf-8")
-                        results.append({"type": "text", "content": content, "file": md_file})
+                            content = f.read().decode("utf-8")
+                            results.append({"type": "text", "content": content, "file": name})

        except Exception as e:
            self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")
@ -395,7 +415,7 @@ class TCADPParser(RAGFlowPdfParser):
            # Convert file to Base64 format
            if callback:
                callback(0.2, "[TCADP] Converting file to Base64 format")
-            
+
            file_base64 = self._file_to_base64(file_path, binary)
            if callback:
                callback(0.25, f"[TCADP] File converted to Base64, size: {len(file_base64)} characters")
@ -420,23 +440,23 @@ class TCADPParser(RAGFlowPdfParser):
                        "TableResultType": self.table_result_type,
                        "MarkdownImageResponseType": self.markdown_image_response_type
                    }
-                    
+
                    self.logger.info(f"[TCADP] API request config - TableResultType: {self.table_result_type}, MarkdownImageResponseType: {self.markdown_image_response_type}")

                    result = client.reconstruct_document_sse(
-                        file_type=file_type, 
-                        file_base64=file_base64, 
-                        file_start_page=file_start_page, 
-                        file_end_page=file_end_page, 
+                        file_type=file_type,
+                        file_base64=file_base64,
+                        file_start_page=file_start_page,
+                        file_end_page=file_end_page,
                        config=config
                    )
-                    
+
                    if result:
                        self.logger.info(f"[TCADP] Attempt {attempt + 1} successful")
                        break
                    else:
                        self.logger.warning(f"[TCADP] Attempt {attempt + 1} failed, result is None")
-                        
+
                except Exception as e:
                    self.logger.error(f"[TCADP] Attempt {attempt + 1} exception: {e}")
                    if attempt == max_retries - 1: