Fix: Remove pdf embed support, update based on #10635 (#10663)

### What problem does this PR solve?

Fix: Remove pdf embed support, update based on #10635

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Billy Bao
2025-10-20 13:45:53 +08:00
committed by GitHub
parent 5fc59a3132
commit d956a442ce
3 changed files with 20 additions and 74 deletions

View File

@ -35,7 +35,6 @@ from typing import List, Union, Tuple
# Third-party imports
import olefile
import fitz
import pdfplumber
from cachetools import LRUCache, cached
from PIL import Image
@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob):
return blob
def _is_zip(h: bytes) -> bool:
return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08")
@ -340,9 +340,11 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
return data
_ = int.from_bytes(data[pos:pos+4], "little")
pos += 4
for _ in range(3): # filename/src/tmp (NUL-terminated ANSI)
# filename/src/tmp (NUL-terminated ANSI)
for _ in range(3):
z = data.index(b"\x00", pos)
pos = z + 1
# skip unknown 4 bytes
pos += 4
if pos + 4 > len(data):
return data
@ -356,11 +358,9 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
"""
Only extract the "first layer" of embedding, returning raw (filename, bytes).
These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing.
Only extract the 'first layer' of embedding, returning raw (filename, bytes).
"""
top = bytes(target)
head = top[:8]
out: List[Tuple[str, bytes]] = []
seen = set()
@ -371,7 +371,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
return
seen.add(h10)
ext = _guess_ext(b)
# If name_hint does not have a clear extension, use the extension guessed from the content
# If name_hint has an extension use its basename; else fallback to guessed ext
if "." in name_hint:
fname = name_hint.split("/")[-1]
else:
@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
if _is_zip(head):
try:
with zipfile.ZipFile(io.BytesIO(top), "r") as z:
embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/",
"xl/embeddings/", "ppt/embeddings/")
embed_dirs = (
"word/embeddings/", "word/objects/", "word/activex/",
"xl/embeddings/", "ppt/embeddings/"
)
for name in z.namelist():
low = name.lower()
if any(low.startswith(d) for d in embed_dirs):
@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
pass
return out
# PDF attachments
if _is_pdf(head):
try:
doc = fitz.open(stream=top, filetype="pdf")
count = getattr(doc, "embfile_count", 0)
for i in range(count):
try:
data = doc.embfile_get(i)
if data:
push(bytes(data), f"EmbeddedFiles[{i}]")
except Exception:
pass
doc.close()
except Exception:
pass
return out
# Legacy OLE (.doc/.xls/.ppt)
# OLE container (doc/ppt/xls)
if _is_ole(head):
try:
with olefile.OleFileIO(io.BytesIO(top)) as ole:

View File

@ -137,7 +137,6 @@ dependencies = [
"mammoth>=1.11.0",
"markdownify>=1.2.0",
"captcha>=0.7.1",
"fitz>=0.0.1.dev2",
]
[project.optional-dependencies]

38
uv.lock generated
View File

@ -1708,25 +1708,6 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7", size = 16159, upload-time = "2024-06-22T15:59:12.695Z" },
]
[[package]]
name = "fitz"
version = "0.0.1.dev2"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "configobj" },
{ name = "configparser" },
{ name = "httplib2" },
{ name = "nibabel" },
{ name = "nipype" },
{ name = "numpy" },
{ name = "pandas" },
{ name = "pyxnat" },
{ name = "scipy" },
]
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/28/27f27d66eb82f24e6595deb26c0a875e62431878c416e38eac515023abb2/fitz-0.0.1.dev2-py2.py3-none-any.whl", hash = "sha256:3b75083d58068d9bd51695eb2f78c9c92094cd6c8dada839e93edcddf18c0c5c", size = 20003, upload-time = "2017-02-25T23:29:54.403Z" },
]
[[package]]
name = "flagembedding"
version = "1.2.10"
@ -5237,21 +5218,6 @@ crypto = [
{ name = "cryptography" },
]
[[package]]
name = "pymupdf"
version = "1.26.5"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/9a/e0a4e92a85fc17be7c54afdbb113f0ade2a8bca49856d510e28bd249e462/pymupdf-1.26.5.tar.gz", hash = "sha256:8ef335e07f648492df240f2247854d0e7c0467afb9c4dc2376ec30978ec158c3", size = 84319274, upload-time = "2025-10-10T14:04:51.826Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/3f/7fc927fd66922ce838d4c974ff9a685c5f5aba108a5d94914dc05c9371f5/pymupdf-1.26.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bfb58f07ad631e5f71ad0bd6f1ff52700f7ba7ebb4973130e81e75b721beae1", size = 23065601, upload-time = "2025-10-10T13:58:43.98Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/e2/e87e62284ba98d59f1fd4fc7542ef2ed0002525754a485fa4077b3bbddae/pymupdf-1.26.5-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d58599479bc471d3ae56c3d68d9160d0b7de8a3bd40221ddc3a4eaae2d281b86", size = 22412612, upload-time = "2025-10-10T13:59:04.846Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/c2/af93c6367f79e9b5435f803bde51c1dc8225f054f8238162dda80b44986d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7dfea81fdd73437a6a6ce83e1fcf556faee9327a6540571e58bf04fa362bb0cd", size = 23457410, upload-time = "2025-10-10T22:45:26.355Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/5a/1292a0df4ff71fbc00dfa8c08759d17c97e1e8ea9277eb5bc5f079ca188d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:caad0ffeb63dcc4a29ca40f3c68d7b78d32a932e834b0056b529cc0bdbaaffc9", size = 24064941, upload-time = "2025-10-10T13:59:48.544Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/90/87b7fdfc9cd6991a3eb69a5752f6343374c34f258c511c242f4d60791eea/pymupdf-1.26.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e24e7a7d696bd398543cc5c147869edb2026d5d5a21b7f8e35db2f20170b389e", size = 24268203, upload-time = "2025-10-10T14:00:28.791Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/99/9d4b36485538e29df0a013fb02bbf6b5b0743a428fa07515e36631c43363/pymupdf-1.26.5-cp39-abi3-win32.whl", hash = "sha256:a2a42f5911d153a47bf5c3e162a0bfe8745eb9bec3e59fbaf87617b4003d8270", size = 17130722, upload-time = "2025-10-10T14:00:51.377Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/96/fd59c1532891762ea4815e73956c532053d5e26d56969e1e5d1e4ca4b207/pymupdf-1.26.5-cp39-abi3-win_amd64.whl", hash = "sha256:39a6fb58182b27b51ea8150a0cd2e4ee7e0cf71e9d6723978f28699b42ee61ae", size = 18747258, upload-time = "2025-10-10T14:01:37.346Z" },
]
[[package]]
name = "pymysql"
version = "1.1.1"
@ -5706,7 +5672,6 @@ dependencies = [
{ name = "elasticsearch-dsl" },
{ name = "extract-msg" },
{ name = "filelock" },
{ name = "fitz" },
{ name = "flasgger" },
{ name = "flask" },
{ name = "flask-cors" },
@ -5759,7 +5724,6 @@ dependencies = [
{ name = "pyclipper" },
{ name = "pycryptodomex" },
{ name = "pyicu" },
{ name = "pymupdf" },
{ name = "pymysql" },
{ name = "pyodbc" },
{ name = "pypdf" },
@ -5868,7 +5832,6 @@ requires-dist = [
{ name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" },
{ name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" },
{ name = "filelock", specifier = "==3.15.4" },
{ name = "fitz", specifier = ">=0.0.1.dev2" },
{ name = "flagembedding", marker = "extra == 'full'", specifier = "==1.2.10" },
{ name = "flasgger", specifier = ">=0.9.7.1,<0.10.0" },
{ name = "flask", specifier = "==3.0.3" },
@ -5922,7 +5885,6 @@ requires-dist = [
{ name = "pyclipper", specifier = "==1.3.0.post5" },
{ name = "pycryptodomex", specifier = "==3.20.0" },
{ name = "pyicu", specifier = ">=2.15.3,<3.0.0" },
{ name = "pymupdf", specifier = ">=1.26.5" },
{ name = "pymysql", specifier = ">=1.1.1,<2.0.0" },
{ name = "pyodbc", specifier = ">=5.2.0,<6.0.0" },
{ name = "pypdf", specifier = "==6.0.0" },