Fix: zip extraction vulnerabilities in MinerU and TCADP (#12527)

### What problem does this PR solve?

Fix zip extraction vulnerabilities:
   - Block symlink entries in zip files.
   - Reject encrypted zip entries.
   - Reject entries with absolute paths (including Windows drive-letter and backslash-separated paths).
   - Block path traversal attempts (../).
   - Stop zip slip exploits (directory escape) by checking that each resolved destination stays inside the extraction directory.
   - Stream HTTP downloads and extracted zip entries to disk instead of buffering whole files in memory.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2026-01-13 12:24:50 +08:00
committed by GitHub
parent 41c84fd78f
commit 64c75d558e
2 changed files with 130 additions and 83 deletions

View File

@ -17,6 +17,7 @@ import json
import logging import logging
import os import os
import re import re
import shutil
import sys import sys
import tempfile import tempfile
import threading import threading
@ -138,39 +139,58 @@ class MinerUParser(RAGFlowPdfParser):
self.outlines = [] self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__) self.logger = logging.getLogger(self.__class__.__name__)
@staticmethod
def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
return (member.external_attr >> 16) & 0o170000 == 0o120000
def _extract_zip_no_root(self, zip_path, extract_to, root_dir): def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}") self.logger.info(f"[MinerU] Extract zip: zip_path={zip_path}, extract_to={extract_to}, root_hint={root_dir}")
base_dir = Path(extract_to).resolve()
with zipfile.ZipFile(zip_path, "r") as zip_ref: with zipfile.ZipFile(zip_path, "r") as zip_ref:
members = zip_ref.infolist()
if not root_dir: if not root_dir:
files = zip_ref.namelist() if members and members[0].filename.endswith("/"):
if files and files[0].endswith("/"): root_dir = members[0].filename
root_dir = files[0]
else: else:
root_dir = None root_dir = None
if root_dir:
root_dir = root_dir.replace("\\", "/")
if not root_dir.endswith("/"):
root_dir += "/"
if not root_dir or not root_dir.endswith("/"): for member in members:
self.logger.info(f"[MinerU] No root directory found, extracting all (root_hint={root_dir})") if member.flag_bits & 0x1:
zip_ref.extractall(extract_to) raise RuntimeError(f"[MinerU] Encrypted zip entry not supported: {member.filename}")
return if self._is_zipinfo_symlink(member):
raise RuntimeError(f"[MinerU] Symlink zip entry not supported: {member.filename}")
root_len = len(root_dir) name = member.filename.replace("\\", "/")
for member in zip_ref.infolist(): if root_dir and name == root_dir:
filename = member.filename
if filename == root_dir:
self.logger.info("[MinerU] Ignore root folder...") self.logger.info("[MinerU] Ignore root folder...")
continue continue
if root_dir and name.startswith(root_dir):
name = name[len(root_dir) :]
if not name:
continue
if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
raise RuntimeError(f"[MinerU] Unsafe zip path (absolute): {member.filename}")
path = filename parts = [p for p in name.split("/") if p not in ("", ".")]
if path.startswith(root_dir): if any(p == ".." for p in parts):
path = path[root_len:] raise RuntimeError(f"[MinerU] Unsafe zip path (traversal): {member.filename}")
rel_path = os.path.join(*parts) if parts else ""
dest_path = (Path(extract_to) / rel_path).resolve(strict=False)
if dest_path != base_dir and base_dir not in dest_path.parents:
raise RuntimeError(f"[MinerU] Unsafe zip path (escape): {member.filename}")
full_path = os.path.join(extract_to, path)
if member.is_dir(): if member.is_dir():
os.makedirs(full_path, exist_ok=True) os.makedirs(dest_path, exist_ok=True)
else: continue
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "wb") as f: os.makedirs(dest_path.parent, exist_ok=True)
f.write(zip_ref.read(filename)) with zip_ref.open(member) as src, open(dest_path, "wb") as dst:
shutil.copyfileobj(src, dst)
@staticmethod @staticmethod
def _is_http_endpoint_valid(url, timeout=5): def _is_http_endpoint_valid(url, timeout=5):
@ -237,8 +257,6 @@ class MinerUParser(RAGFlowPdfParser):
output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir)) output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip") output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = { data = {
"output_dir": "./output", "output_dir": "./output",
"lang_list": options.lang, "lang_list": options.lang,
@ -270,26 +288,35 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}") self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
if callback: if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, with open(pdf_file_path, "rb") as pdf_file:
timeout=1800) files = {"files": (pdf_file_name + ".pdf", pdf_file, "application/pdf")}
with requests.post(
url=f"{self.mineru_api}/file_parse",
files=files,
data=data,
headers=headers,
timeout=1800,
stream=True,
) as response:
response.raise_for_status()
content_type = response.headers.get("Content-Type", "")
if content_type.startswith("application/zip"):
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
response.raise_for_status() if callback:
if response.headers.get("Content-Type") == "application/zip": callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
if callback: with open(output_zip_path, "wb") as f:
callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...") response.raw.decode_content = True
shutil.copyfileobj(response.raw, f)
with open(output_zip_path, "wb") as f: self.logger.info(f"[MinerU] Unzip to {output_path}...")
f.write(response.content) self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
self.logger.info(f"[MinerU] Unzip to {output_path}...") if callback:
self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/") callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
if callback: self.logger.warning(f"[MinerU] not zip returned from api: {content_type}")
callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
except Exception as e: except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}") raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.") self.logger.info("[MinerU] Api completed successfully.")

View File

@ -17,6 +17,7 @@ import base64
import json import json
import logging import logging
import os import os
import re
import shutil import shutil
import tempfile import tempfile
import time import time
@ -168,9 +169,6 @@ class TencentCloudAPIClient:
return None return None
try: try:
response = requests.get(download_url)
response.raise_for_status()
# Ensure output directory exists # Ensure output directory exists
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
@ -179,15 +177,22 @@ class TencentCloudAPIClient:
filename = f"tcadp_result_{timestamp}.zip" filename = f"tcadp_result_{timestamp}.zip"
file_path = os.path.join(output_dir, filename) file_path = os.path.join(output_dir, filename)
# Save file with requests.get(download_url, stream=True) as response:
with open(file_path, "wb") as f: response.raise_for_status()
f.write(response.content) with open(file_path, "wb") as f:
response.raw.decode_content = True
shutil.copyfileobj(response.raw, f)
logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}") logging.info(f"[TCADP] Document parsing result downloaded to: {os.path.basename(file_path)}")
return file_path return file_path
except requests.exceptions.RequestException as e: except Exception as e:
logging.error(f"[TCADP] Failed to download file: {e}") logging.error(f"[TCADP] Failed to download file: {e}")
try:
if "file_path" in locals() and os.path.exists(file_path):
os.unlink(file_path)
except Exception:
pass
return None return None
@ -237,6 +242,10 @@ class TCADPParser(RAGFlowPdfParser):
if not self.secret_id or not self.secret_key: if not self.secret_id or not self.secret_key:
raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml") raise ValueError("[TCADP] Please set Tencent Cloud API keys, configure tcadp_config in service_conf.yaml")
@staticmethod
def _is_zipinfo_symlink(member: zipfile.ZipInfo) -> bool:
return (member.external_attr >> 16) & 0o170000 == 0o120000
def check_installation(self) -> bool: def check_installation(self) -> bool:
"""Check if Tencent Cloud API configuration is correct""" """Check if Tencent Cloud API configuration is correct"""
try: try:
@ -271,23 +280,34 @@ class TCADPParser(RAGFlowPdfParser):
try: try:
with zipfile.ZipFile(zip_path, "r") as zip_file: with zipfile.ZipFile(zip_path, "r") as zip_file:
# Find JSON result files members = zip_file.infolist()
json_files = [f for f in zip_file.namelist() if f.endswith(".json")] for member in members:
name = member.filename.replace("\\", "/")
if member.is_dir():
continue
if member.flag_bits & 0x1:
raise RuntimeError(f"[TCADP] Encrypted zip entry not supported: {member.filename}")
if self._is_zipinfo_symlink(member):
raise RuntimeError(f"[TCADP] Symlink zip entry not supported: {member.filename}")
if name.startswith("/") or name.startswith("//") or re.match(r"^[A-Za-z]:", name):
raise RuntimeError(f"[TCADP] Unsafe zip path (absolute): {member.filename}")
parts = [p for p in name.split("/") if p not in ("", ".")]
if any(p == ".." for p in parts):
raise RuntimeError(f"[TCADP] Unsafe zip path (traversal): {member.filename}")
for json_file in json_files: if not (name.endswith(".json") or name.endswith(".md")):
with zip_file.open(json_file) as f: continue
data = json.load(f)
if isinstance(data, list): with zip_file.open(member) as f:
results.extend(data) if name.endswith(".json"):
data = json.load(f)
if isinstance(data, list):
results.extend(data)
else:
results.append(data)
else: else:
results.append(data) content = f.read().decode("utf-8")
results.append({"type": "text", "content": content, "file": name})
# Find Markdown files
md_files = [f for f in zip_file.namelist() if f.endswith(".md")]
for md_file in md_files:
with zip_file.open(md_file) as f:
content = f.read().decode("utf-8")
results.append({"type": "text", "content": content, "file": md_file})
except Exception as e: except Exception as e:
self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}") self.logger.error(f"[TCADP] Failed to extract ZIP file content: {e}")