Fix: Remove pdf embed support, update based on #10635 (#10663)

### What problem does this PR solve?

Fix: Remove pdf embed support, update based on  #10635

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Billy Bao
2025-10-20 13:45:53 +08:00
committed by GitHub
parent 5fc59a3132
commit d956a442ce
3 changed files with 20 additions and 74 deletions

View File

@ -35,7 +35,6 @@ from typing import List, Union, Tuple
# Third-party imports
import olefile
import fitz
import pdfplumber
from cachetools import LRUCache, cached
from PIL import Image
@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob):
return blob
def _is_zip(h: bytes) -> bool:
return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08")
@ -317,18 +317,18 @@ def _guess_ext(b: bytes) -> str:
try:
with zipfile.ZipFile(io.BytesIO(b), "r") as z:
names = [n.lower() for n in z.namelist()]
if any(n.startswith("word/") for n in names):
if any(n.startswith("word/") for n in names):
return ".docx"
if any(n.startswith("ppt/") for n in names):
if any(n.startswith("ppt/") for n in names):
return ".pptx"
if any(n.startswith("xl/") for n in names):
if any(n.startswith("xl/") for n in names):
return ".xlsx"
except Exception:
pass
return ".zip"
if _is_pdf(h):
if _is_pdf(h):
return ".pdf"
if _is_ole(h):
if _is_ole(h):
return ".doc"
return ".bin"
@ -336,19 +336,21 @@ def _guess_ext(b: bytes) -> str:
def _extract_ole10native_payload(data: bytes) -> bytes:
try:
pos = 0
if len(data) < 4:
if len(data) < 4:
return data
_ = int.from_bytes(data[pos:pos+4], "little")
pos += 4
for _ in range(3): # filename/src/tmp (NUL-terminated ANSI)
# filename/src/tmp (NUL-terminated ANSI)
for _ in range(3):
z = data.index(b"\x00", pos)
pos = z + 1
# skip unknown 4 bytes
pos += 4
if pos + 4 > len(data):
if pos + 4 > len(data):
return data
size = int.from_bytes(data[pos:pos+4], "little")
pos += 4
if pos + size <= len(data):
if pos + size <= len(data):
return data[pos:pos+size]
except Exception:
pass
@ -356,22 +358,20 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
"""
Only extract the "first layer" of embedding, returning raw (filename, bytes).
These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing.
Only extract the 'first layer' of embedding, returning raw (filename, bytes).
"""
top = bytes(target)
head = top[:8]
out: List[Tuple[str, bytes]] = []
seen = set()
def push(b: bytes, name_hint: str = ""):
h10 = _sha10(b)
if h10 in seen:
if h10 in seen:
return
seen.add(h10)
ext = _guess_ext(b)
# If name_hint does not have a clear extension, use the extension guessed from the content
# If name_hint has an extension use its basename; else fallback to guessed ext
if "." in name_hint:
fname = name_hint.split("/")[-1]
else:
@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
if _is_zip(head):
try:
with zipfile.ZipFile(io.BytesIO(top), "r") as z:
embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/",
"xl/embeddings/", "ppt/embeddings/")
embed_dirs = (
"word/embeddings/", "word/objects/", "word/activex/",
"xl/embeddings/", "ppt/embeddings/"
)
for name in z.namelist():
low = name.lower()
if any(low.startswith(d) for d in embed_dirs):
@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
pass
return out
# PDF attachments
if _is_pdf(head):
try:
doc = fitz.open(stream=top, filetype="pdf")
count = getattr(doc, "embfile_count", 0)
for i in range(count):
try:
data = doc.embfile_get(i)
if data:
push(bytes(data), f"EmbeddedFiles[{i}]")
except Exception:
pass
doc.close()
except Exception:
pass
return out
# Legacy OLE (.doc/.xls/.ppt)
# OLE container (doc/ppt/xls)
if _is_ole(head):
try:
with olefile.OleFileIO(io.BytesIO(top)) as ole: