Fix: Remove pdf embed support, update based on #10635 (#10663)

### What problem does this PR solve? Fix: Remove pdf embed support, update based on #10635 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-29 22:56:36 +08:00 · 2025-10-20 13:45:53 +08:00
parent 5fc59a3132
commit d956a442ce
3 changed files with 20 additions and 74 deletions
--- a/api/utils/file_utils.py
+++ b/api/utils/file_utils.py
@ -35,7 +35,6 @@ from typing import List, Union, Tuple

 # Third-party imports
 import olefile
-import fitz
 import pdfplumber
 from cachetools import LRUCache, cached
 from PIL import Image
@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob):
    return blob


+
 def _is_zip(h: bytes) -> bool:
    return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08")

@ -317,18 +317,18 @@ def _guess_ext(b: bytes) -> str:
        try:
            with zipfile.ZipFile(io.BytesIO(b), "r") as z:
                names = [n.lower() for n in z.namelist()]
-                if any(n.startswith("word/") for n in names): 
+                if any(n.startswith("word/") for n in names):
                    return ".docx"
-                if any(n.startswith("ppt/")  for n in names): 
+                if any(n.startswith("ppt/") for n in names):
                    return ".pptx"
-                if any(n.startswith("xl/")   for n in names): 
+                if any(n.startswith("xl/") for n in names):
                    return ".xlsx"
        except Exception:
            pass
        return ".zip"
-    if _is_pdf(h): 
+    if _is_pdf(h):
        return ".pdf"
-    if _is_ole(h): 
+    if _is_ole(h):
        return ".doc"
    return ".bin"

@ -336,19 +336,21 @@ def _guess_ext(b: bytes) -> str:
 def _extract_ole10native_payload(data: bytes) -> bytes:
    try:
        pos = 0
-        if len(data) < 4: 
+        if len(data) < 4:
            return data
        _ = int.from_bytes(data[pos:pos+4], "little")
        pos += 4
-        for _ in range(3):  # filename/src/tmp (NUL-terminated ANSI)
+        # filename/src/tmp (NUL-terminated ANSI)
+        for _ in range(3):
            z = data.index(b"\x00", pos)
            pos = z + 1
+        # skip unknown 4 bytes
        pos += 4
-        if pos + 4 > len(data): 
+        if pos + 4 > len(data):
            return data
        size = int.from_bytes(data[pos:pos+4], "little")
        pos += 4
-        if pos + size <= len(data): 
+        if pos + size <= len(data):
            return data[pos:pos+size]
    except Exception:
        pass
@ -356,22 +358,20 @@ def _extract_ole10native_payload(data: bytes) -> bytes:

 def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
    """
-    Only extract the "first layer" of embedding, returning raw (filename, bytes).
-    These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing.
+    Only extract the 'first layer' of embedding, returning raw (filename, bytes).
    """
    top = bytes(target)
-
    head = top[:8]
    out: List[Tuple[str, bytes]] = []
    seen = set()

    def push(b: bytes, name_hint: str = ""):
        h10 = _sha10(b)
-        if h10 in seen: 
+        if h10 in seen:
            return
        seen.add(h10)
        ext = _guess_ext(b)
-        # If name_hint does not have a clear extension, use the extension guessed from the content
+        # If name_hint has an extension use its basename; else fallback to guessed ext
        if "." in name_hint:
            fname = name_hint.split("/")[-1]
        else:
@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
    if _is_zip(head):
        try:
            with zipfile.ZipFile(io.BytesIO(top), "r") as z:
-                embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/",
-                              "xl/embeddings/", "ppt/embeddings/")
+                embed_dirs = (
+                    "word/embeddings/", "word/objects/", "word/activex/",
+                    "xl/embeddings/", "ppt/embeddings/"
+                )
                for name in z.namelist():
                    low = name.lower()
                    if any(low.startswith(d) for d in embed_dirs):
@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
            pass
        return out

-    # PDF attachments
-    if _is_pdf(head):
-        try:
-            doc = fitz.open(stream=top, filetype="pdf")
-            count = getattr(doc, "embfile_count", 0)
-            for i in range(count):
-                try:
-                    data = doc.embfile_get(i)
-                    if data:
-                        push(bytes(data), f"EmbeddedFiles[{i}]")
-                except Exception:
-                    pass
-            doc.close()
-        except Exception:
-            pass
-        return out
-
-    # Legacy OLE (.doc/.xls/.ppt)
+    # OLE container (doc/ppt/xls)
    if _is_ole(head):
        try:
            with olefile.OleFileIO(io.BytesIO(top)) as ole: