diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 71389c01a..af325baf0 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -35,7 +35,6 @@ from typing import List, Union, Tuple # Third-party imports import olefile -import fitz import pdfplumber from cachetools import LRUCache, cached from PIL import Image @@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob): return blob + def _is_zip(h: bytes) -> bool: return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08") @@ -317,18 +317,18 @@ def _guess_ext(b: bytes) -> str: try: with zipfile.ZipFile(io.BytesIO(b), "r") as z: names = [n.lower() for n in z.namelist()] - if any(n.startswith("word/") for n in names): + if any(n.startswith("word/") for n in names): return ".docx" - if any(n.startswith("ppt/") for n in names): + if any(n.startswith("ppt/") for n in names): return ".pptx" - if any(n.startswith("xl/") for n in names): + if any(n.startswith("xl/") for n in names): return ".xlsx" except Exception: pass return ".zip" - if _is_pdf(h): + if _is_pdf(h): return ".pdf" - if _is_ole(h): + if _is_ole(h): return ".doc" return ".bin" @@ -336,19 +336,21 @@ def _guess_ext(b: bytes) -> str: def _extract_ole10native_payload(data: bytes) -> bytes: try: pos = 0 - if len(data) < 4: + if len(data) < 4: return data _ = int.from_bytes(data[pos:pos+4], "little") pos += 4 - for _ in range(3): # filename/src/tmp (NUL-terminated ANSI) + # filename/src/tmp (NUL-terminated ANSI) + for _ in range(3): z = data.index(b"\x00", pos) pos = z + 1 + # skip unknown 4 bytes pos += 4 - if pos + 4 > len(data): + if pos + 4 > len(data): return data size = int.from_bytes(data[pos:pos+4], "little") pos += 4 - if pos + size <= len(data): + if pos + size <= len(data): return data[pos:pos+size] except Exception: pass @@ -356,22 +358,20 @@ def _extract_ole10native_payload(data: bytes) -> bytes: def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]: """ - Only extract the "first layer" of embedding, returning raw (filename, bytes). - These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing. + Only extract the 'first layer' of embedding, returning raw (filename, bytes). """ top = bytes(target) - head = top[:8] out: List[Tuple[str, bytes]] = [] seen = set() def push(b: bytes, name_hint: str = ""): h10 = _sha10(b) - if h10 in seen: + if h10 in seen: return seen.add(h10) ext = _guess_ext(b) - # If name_hint does not have a clear extension, use the extension guessed from the content + # If name_hint has an extension use its basename; else fallback to guessed ext if "." in name_hint: fname = name_hint.split("/")[-1] else: @@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes if _is_zip(head): try: with zipfile.ZipFile(io.BytesIO(top), "r") as z: - embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/", - "xl/embeddings/", "ppt/embeddings/") + embed_dirs = ( + "word/embeddings/", "word/objects/", "word/activex/", + "xl/embeddings/", "ppt/embeddings/" + ) for name in z.namelist(): low = name.lower() if any(low.startswith(d) for d in embed_dirs): @@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes pass return out - # PDF attachments - if _is_pdf(head): - try: - doc = fitz.open(stream=top, filetype="pdf") - count = getattr(doc, "embfile_count", 0) - for i in range(count): - try: - data = doc.embfile_get(i) - if data: - push(bytes(data), f"EmbeddedFiles[{i}]") - except Exception: - pass - doc.close() - except Exception: - pass - return out - - # Legacy OLE (.doc/.xls/.ppt) + # OLE container (doc/ppt/xls) if _is_ole(head): try: with olefile.OleFileIO(io.BytesIO(top)) as ole: diff --git a/pyproject.toml b/pyproject.toml index 4237e692d..486de7b40 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -137,7 +137,6 @@ dependencies = [ "mammoth>=1.11.0", "markdownify>=1.2.0", "captcha>=0.7.1", - "fitz>=0.0.1.dev2", ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index db5e28832..732662c81 100644 --- a/uv.lock +++ b/uv.lock @@ -1708,25 +1708,6 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ae/f0/48285f0262fe47103a4a45972ed2f9b93e4c80b8fd609fa98da78b2a5706/filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7", size = 16159, upload-time = "2024-06-22T15:59:12.695Z" }, ] -[[package]] -name = "fitz" -version = "0.0.1.dev2" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -dependencies = [ - { name = "configobj" }, - { name = "configparser" }, - { name = "httplib2" }, - { name = "nibabel" }, - { name = "nipype" }, - { name = "numpy" }, - { name = "pandas" }, - { name = "pyxnat" }, - { name = "scipy" }, -] -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/28/27f27d66eb82f24e6595deb26c0a875e62431878c416e38eac515023abb2/fitz-0.0.1.dev2-py2.py3-none-any.whl", hash = "sha256:3b75083d58068d9bd51695eb2f78c9c92094cd6c8dada839e93edcddf18c0c5c", size = 20003, upload-time = "2017-02-25T23:29:54.403Z" }, -] - [[package]] name = "flagembedding" version = "1.2.10" @@ -5237,21 +5218,6 @@ crypto = [ { name = "cryptography" }, ] -[[package]] -name = "pymupdf" -version = "1.26.5" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8d/9a/e0a4e92a85fc17be7c54afdbb113f0ade2a8bca49856d510e28bd249e462/pymupdf-1.26.5.tar.gz", hash = "sha256:8ef335e07f648492df240f2247854d0e7c0467afb9c4dc2376ec30978ec158c3", size = 84319274, upload-time = "2025-10-10T14:04:51.826Z" } -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/dd/3f/7fc927fd66922ce838d4c974ff9a685c5f5aba108a5d94914dc05c9371f5/pymupdf-1.26.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2bfb58f07ad631e5f71ad0bd6f1ff52700f7ba7ebb4973130e81e75b721beae1", size = 23065601, upload-time = "2025-10-10T13:58:43.98Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c1/e2/e87e62284ba98d59f1fd4fc7542ef2ed0002525754a485fa4077b3bbddae/pymupdf-1.26.5-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:d58599479bc471d3ae56c3d68d9160d0b7de8a3bd40221ddc3a4eaae2d281b86", size = 22412612, upload-time = "2025-10-10T13:59:04.846Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/df/c2/af93c6367f79e9b5435f803bde51c1dc8225f054f8238162dda80b44986d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:7dfea81fdd73437a6a6ce83e1fcf556faee9327a6540571e58bf04fa362bb0cd", size = 23457410, upload-time = "2025-10-10T22:45:26.355Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5b/5a/1292a0df4ff71fbc00dfa8c08759d17c97e1e8ea9277eb5bc5f079ca188d/pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:caad0ffeb63dcc4a29ca40f3c68d7b78d32a932e834b0056b529cc0bdbaaffc9", size = 24064941, upload-time = "2025-10-10T13:59:48.544Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/28/90/87b7fdfc9cd6991a3eb69a5752f6343374c34f258c511c242f4d60791eea/pymupdf-1.26.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:e24e7a7d696bd398543cc5c147869edb2026d5d5a21b7f8e35db2f20170b389e", size = 24268203, upload-time = "2025-10-10T14:00:28.791Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/2c/99/9d4b36485538e29df0a013fb02bbf6b5b0743a428fa07515e36631c43363/pymupdf-1.26.5-cp39-abi3-win32.whl", hash = "sha256:a2a42f5911d153a47bf5c3e162a0bfe8745eb9bec3e59fbaf87617b4003d8270", size = 17130722, upload-time = "2025-10-10T14:00:51.377Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/c6/96/fd59c1532891762ea4815e73956c532053d5e26d56969e1e5d1e4ca4b207/pymupdf-1.26.5-cp39-abi3-win_amd64.whl", hash = "sha256:39a6fb58182b27b51ea8150a0cd2e4ee7e0cf71e9d6723978f28699b42ee61ae", size = 18747258, upload-time = "2025-10-10T14:01:37.346Z" }, -] - [[package]] name = "pymysql" version = "1.1.1" @@ -5706,7 +5672,6 @@ dependencies = [ { name = "elasticsearch-dsl" }, { name = "extract-msg" }, { name = "filelock" }, - { name = "fitz" }, { name = "flasgger" }, { name = "flask" }, { name = "flask-cors" }, @@ -5759,7 +5724,6 @@ dependencies = [ { name = "pyclipper" }, { name = "pycryptodomex" }, { name = "pyicu" }, - { name = "pymupdf" }, { name = "pymysql" }, { name = "pyodbc" }, { name = "pypdf" }, @@ -5868,7 +5832,6 @@ requires-dist = [ { name = "fastembed", marker = "(platform_machine != 'x86_64' and extra == 'full') or (sys_platform == 'darwin' and extra == 'full')", specifier = ">=0.3.6,<0.4.0" }, { name = "fastembed-gpu", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'full'", specifier = ">=0.3.6,<0.4.0" }, { name = "filelock", specifier = "==3.15.4" }, - { name = "fitz", specifier = ">=0.0.1.dev2" }, { name = "flagembedding", marker = "extra == 'full'", specifier = "==1.2.10" }, { name = "flasgger", specifier = ">=0.9.7.1,<0.10.0" }, { name = "flask", specifier = "==3.0.3" }, @@ -5922,7 +5885,6 @@ requires-dist = [ { name = "pyclipper", specifier = "==1.3.0.post5" }, { name = "pycryptodomex", specifier = "==3.20.0" }, { name = "pyicu", specifier = ">=2.15.3,<3.0.0" }, - { name = "pymupdf", specifier = ">=1.26.5" }, { name = "pymysql", specifier = ">=1.1.1,<2.0.0" }, { name = "pyodbc", specifier = ">=5.2.0,<6.0.0" }, { name = "pypdf", specifier = "==6.0.0" },