Fix: Remove pdf embed support, update based on #10635 (#10663)

### What problem does this PR solve?

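Fix: Remove PDF embed support; update based on #10635. This drops the PDF-attachment branch from `extract_embed_file` along with the `fitz` import, and removes the `fitz` and `pymupdf` entries from the dependency files.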

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Billy Bao
2025-10-20 13:45:53 +08:00
committed by GitHub
parent 5fc59a3132
commit d956a442ce
3 changed files with 20 additions and 74 deletions

View File

@ -35,7 +35,6 @@ from typing import List, Union, Tuple
# Third-party imports # Third-party imports
import olefile import olefile
import fitz
import pdfplumber import pdfplumber
from cachetools import LRUCache, cached from cachetools import LRUCache, cached
from PIL import Image from PIL import Image
@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob):
return blob return blob
def _is_zip(h: bytes) -> bool: def _is_zip(h: bytes) -> bool:
return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08") return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08")
@ -340,9 +340,11 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
return data return data
_ = int.from_bytes(data[pos:pos+4], "little") _ = int.from_bytes(data[pos:pos+4], "little")
pos += 4 pos += 4
for _ in range(3): # filename/src/tmp (NUL-terminated ANSI) # filename/src/tmp (NUL-terminated ANSI)
for _ in range(3):
z = data.index(b"\x00", pos) z = data.index(b"\x00", pos)
pos = z + 1 pos = z + 1
# skip unknown 4 bytes
pos += 4 pos += 4
if pos + 4 > len(data): if pos + 4 > len(data):
return data return data
@ -356,11 +358,9 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]: def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
""" """
Only extract the "first layer" of embedding, returning raw (filename, bytes). Only extract the 'first layer' of embedding, returning raw (filename, bytes).
These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing.
""" """
top = bytes(target) top = bytes(target)
head = top[:8] head = top[:8]
out: List[Tuple[str, bytes]] = [] out: List[Tuple[str, bytes]] = []
seen = set() seen = set()
@ -371,7 +371,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
return return
seen.add(h10) seen.add(h10)
ext = _guess_ext(b) ext = _guess_ext(b)
# If name_hint does not have a clear extension, use the extension guessed from the content # If name_hint has an extension use its basename; else fallback to guessed ext
if "." in name_hint: if "." in name_hint:
fname = name_hint.split("/")[-1] fname = name_hint.split("/")[-1]
else: else:
@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
if _is_zip(head): if _is_zip(head):
try: try:
with zipfile.ZipFile(io.BytesIO(top), "r") as z: with zipfile.ZipFile(io.BytesIO(top), "r") as z:
embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/", embed_dirs = (
"xl/embeddings/", "ppt/embeddings/") "word/embeddings/", "word/objects/", "word/activex/",
"xl/embeddings/", "ppt/embeddings/"
)
for name in z.namelist(): for name in z.namelist():
low = name.lower() low = name.lower()
if any(low.startswith(d) for d in embed_dirs): if any(low.startswith(d) for d in embed_dirs):
@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
pass pass
return out return out
# PDF attachments # OLE container (doc/ppt/xls)
if _is_pdf(head):
try:
doc = fitz.open(stream=top, filetype="pdf")
count = getattr(doc, "embfile_count", 0)
for i in range(count):
try:
data = doc.embfile_get(i)
if data:
push(bytes(data), f"EmbeddedFiles[{i}]")
except Exception:
pass
doc.close()
except Exception:
pass
return out
# Legacy OLE (.doc/.xls/.ppt)
if _is_ole(head): if _is_ole(head):
try: try:
with olefile.OleFileIO(io.BytesIO(top)) as ole: with olefile.OleFileIO(io.BytesIO(top)) as ole:

View File

@ -137,7 +137,6 @@ dependencies = [
"mammoth>=1.11.0", "mammoth>=1.11.0",
"markdownify>=1.2.0", "markdownify>=1.2.0",
"captcha>=0.7.1", "captcha>=0.7.1",
"fitz>=0.0.1.dev2",
] ]
[project.optional-dependencies] [project.optional-dependencies]

38
uv.lock generated
View File

@ -1708,25 +1708,6 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7", size = 16159, upload-time = "2024-06-22T15:59:12.695Z" }, { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7", size = 16159, upload-time = "2024-06-22T15:59:12.695Z" },
] ]
[[package]]
name = "fitz"
version = "0.0.1.dev2"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "configobj" },
{ name = "configparser" },
{ name = "httplib2" },
{ name = "nibabel" },
{ name = "nipype" },
{ name = "numpy" },
{ name = "pandas" },
{ name = "pyxnat" },
{ name = "scipy" },
]
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/28/27f27d66eb82f24e6595deb26c0a875e62431878c416e38eac515023abb2/fitz-0.0.1.dev2-py2.py3-none-any.whl", hash = "sha256:3b75083d58068d9bd51695eb2f78c9c92094cd6c8dada839e93edcddf18c0c5c", size = 20003, upload-time = "2017-02-25T23:29:54.403Z" },
]
[[package]] [[package]]
name = "flagembedding" name = "flagembedding"
version = "1.2.10" version = "1.2.10"
@ -5237,21 +5218,6 @@ crypto = [
{ name = "cryptography" }, { name = "cryptography" },
] ]
[[package]]
name = "pymupdf"
version = "1.26.5"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/9a/e0a4e92a85fc17be7c54afdbb113f0ade2a8bca49856d510e28bd249e462/pymupdf-1.26.5.tar.gz", hash = "sha256:8ef335e07f648492df240f2247854d0e7c0467afb9c4dc2376ec30978ec158c3", size = 84319274, upload-time = "2025-10-10T14:04:51.826Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/3f/7fc927fd66922ce838d4c974ff9a685c5f5aba108a5d94914dc05c9371f5/pymupdf-1.26.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bfb58f07ad631e5f71ad0bd6f1ff52700f7ba7ebb4973130e81e75b721beae1", size = 23065601, upload-time = "2025-10-10T13:58:43.98Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/e2/e87e62284ba98d59f1fd4fc7542ef2ed0002525754a485fa4077b3bbddae/pymupdf-1.26.5-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d58599479bc471d3ae56c3d68d9160d0b7de8a3bd40221ddc3a4eaae2d281b86", size = 22412612, upload-time = "2025-10-10T13:59:04.846Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/c2/af93c6367f79e9b5435f803bde51c1dc8225f054f8238162dda80b44986d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7dfea81fdd73437a6a6ce83e1fcf556faee9327a6540571e58bf04fa362bb0cd", size = 23457410, upload-time = "2025-10-10T22:45:26.355Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/5a/1292a0df4ff71fbc00dfa8c08759d17c97e1e8ea9277eb5bc5f079ca188d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:caad0ffeb63dcc4a29ca40f3c68d7b78d32a932e834b0056b529cc0bdbaaffc9", size = 24064941, upload-time = "2025-10-10T13:59:48.544Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/90/87b7fdfc9cd6991a3eb69a5752f6343374c34f258c511c242f4d60791eea/pymupdf-1.26.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e24e7a7d696bd398543cc5c147869edb2026d5d5a21b7f8e35db2f20170b389e", size = 24268203, upload-time = "2025-10-10T14:00:28.791Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/99/9d4b36485538e29df0a013fb02bbf6b5b0743a428fa07515e36631c43363/pymupdf-1.26.5-cp39-abi3-win32.whl", hash = "sha256:a2a42f5911d153a47bf5c3e162a0bfe8745eb9bec3e59fbaf87617b4003d8270", size = 17130722, upload-time = "2025-10-10T14:00:51.377Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/96/fd59c1532891762ea4815e73956c532053d5e26d56969e1e5d1e4ca4b207/pymupdf-1.26.5-cp39-abi3-win_amd64.whl", hash = "sha256:39a6fb58182b27b51ea8150a0cd2e4ee7e0cf71e9d6723978f28699b42ee61ae", size = 18747258, upload-time = "2025-10-10T14:01:37.346Z" },
]
[[package]] [[package]]
name = "pymysql" name = "pymysql"
version = "1.1.1" version = "1.1.1"
@ -5706,7 +5672,6 @@ dependencies = [
{ name = "elasticsearch-dsl" }, { name = "elasticsearch-dsl" },
{ name = "extract-msg" }, { name = "extract-msg" },
{ name = "filelock" }, { name = "filelock" },
{ name = "fitz" },
{ name = "flasgger" }, { name = "flasgger" },
{ name = "flask" }, { name = "flask" },
{ name = "flask-cors" }, { name = "flask-cors" },
@ -5759,7 +5724,6 @@ dependencies = [
{ name = "pyclipper" }, { name = "pyclipper" },
{ name = "pycryptodomex" }, { name = "pycryptodomex" },
{ name = "pyicu" }, { name = "pyicu" },
{ name = "pymupdf" },
{ name = "pymysql" }, { name = "pymysql" },
{ name = "pyodbc" }, { name = "pyodbc" },
{ name = "pypdf" }, { name = "pypdf" },
@ -5868,7 +5832,6 @@ requires-dist = [
{ name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" }, { name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" },
{ name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" }, { name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" },
{ name = "filelock", specifier = "==3.15.4" }, { name = "filelock", specifier = "==3.15.4" },
{ name = "fitz", specifier = ">=0.0.1.dev2" },
{ name = "flagembedding", marker = "extra == 'full'", specifier = "==1.2.10" }, { name = "flagembedding", marker = "extra == 'full'", specifier = "==1.2.10" },
{ name = "flasgger", specifier = ">=0.9.7.1,<0.10.0" }, { name = "flasgger", specifier = ">=0.9.7.1,<0.10.0" },
{ name = "flask", specifier = "==3.0.3" }, { name = "flask", specifier = "==3.0.3" },
@ -5922,7 +5885,6 @@ requires-dist = [
{ name = "pyclipper", specifier = "==1.3.0.post5" }, { name = "pyclipper", specifier = "==1.3.0.post5" },
{ name = "pycryptodomex", specifier = "==3.20.0" }, { name = "pycryptodomex", specifier = "==3.20.0" },
{ name = "pyicu", specifier = ">=2.15.3,<3.0.0" }, { name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
{ name = "pymupdf", specifier = ">=1.26.5" },
{ name = "pymysql", specifier = ">=1.1.1,<2.0.0" }, { name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypdf", specifier = "==6.0.0" }, { name = "pypdf", specifier = "==6.0.0" },