mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
### What problem does this PR solve? Fix: Remove pdf embed support, update based on #10635 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -35,7 +35,6 @@ from typing import List, Union, Tuple
|
||||
|
||||
# Third-party imports
|
||||
import olefile
|
||||
import fitz
|
||||
import pdfplumber
|
||||
from cachetools import LRUCache, cached
|
||||
from PIL import Image
|
||||
@ -299,6 +298,7 @@ def read_potential_broken_pdf(blob):
|
||||
return blob
|
||||
|
||||
|
||||
|
||||
def _is_zip(h: bytes) -> bool:
|
||||
return h.startswith(b"PK\x03\x04") or h.startswith(b"PK\x05\x06") or h.startswith(b"PK\x07\x08")
|
||||
|
||||
@ -317,18 +317,18 @@ def _guess_ext(b: bytes) -> str:
|
||||
try:
|
||||
with zipfile.ZipFile(io.BytesIO(b), "r") as z:
|
||||
names = [n.lower() for n in z.namelist()]
|
||||
if any(n.startswith("word/") for n in names):
|
||||
if any(n.startswith("word/") for n in names):
|
||||
return ".docx"
|
||||
if any(n.startswith("ppt/") for n in names):
|
||||
if any(n.startswith("ppt/") for n in names):
|
||||
return ".pptx"
|
||||
if any(n.startswith("xl/") for n in names):
|
||||
if any(n.startswith("xl/") for n in names):
|
||||
return ".xlsx"
|
||||
except Exception:
|
||||
pass
|
||||
return ".zip"
|
||||
if _is_pdf(h):
|
||||
if _is_pdf(h):
|
||||
return ".pdf"
|
||||
if _is_ole(h):
|
||||
if _is_ole(h):
|
||||
return ".doc"
|
||||
return ".bin"
|
||||
|
||||
@ -336,19 +336,21 @@ def _guess_ext(b: bytes) -> str:
|
||||
def _extract_ole10native_payload(data: bytes) -> bytes:
|
||||
try:
|
||||
pos = 0
|
||||
if len(data) < 4:
|
||||
if len(data) < 4:
|
||||
return data
|
||||
_ = int.from_bytes(data[pos:pos+4], "little")
|
||||
pos += 4
|
||||
for _ in range(3): # filename/src/tmp (NUL-terminated ANSI)
|
||||
# filename/src/tmp (NUL-terminated ANSI)
|
||||
for _ in range(3):
|
||||
z = data.index(b"\x00", pos)
|
||||
pos = z + 1
|
||||
# skip unknown 4 bytes
|
||||
pos += 4
|
||||
if pos + 4 > len(data):
|
||||
if pos + 4 > len(data):
|
||||
return data
|
||||
size = int.from_bytes(data[pos:pos+4], "little")
|
||||
pos += 4
|
||||
if pos + size <= len(data):
|
||||
if pos + size <= len(data):
|
||||
return data[pos:pos+size]
|
||||
except Exception:
|
||||
pass
|
||||
@ -356,22 +358,20 @@ def _extract_ole10native_payload(data: bytes) -> bytes:
|
||||
|
||||
def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes]]:
|
||||
"""
|
||||
Only extract the "first layer" of embedding, returning raw (filename, bytes).
|
||||
These bytes can be directly used for io.BytesIO and then passed to zipfile/fitz/olefile and other libraries for further parsing.
|
||||
Only extract the 'first layer' of embedding, returning raw (filename, bytes).
|
||||
"""
|
||||
top = bytes(target)
|
||||
|
||||
head = top[:8]
|
||||
out: List[Tuple[str, bytes]] = []
|
||||
seen = set()
|
||||
|
||||
def push(b: bytes, name_hint: str = ""):
|
||||
h10 = _sha10(b)
|
||||
if h10 in seen:
|
||||
if h10 in seen:
|
||||
return
|
||||
seen.add(h10)
|
||||
ext = _guess_ext(b)
|
||||
# If name_hint does not have a clear extension, use the extension guessed from the content
|
||||
# If name_hint has an extension use its basename; else fallback to guessed ext
|
||||
if "." in name_hint:
|
||||
fname = name_hint.split("/")[-1]
|
||||
else:
|
||||
@ -382,8 +382,10 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
|
||||
if _is_zip(head):
|
||||
try:
|
||||
with zipfile.ZipFile(io.BytesIO(top), "r") as z:
|
||||
embed_dirs = ("word/embeddings/", "word/objects/", "word/activex/",
|
||||
"xl/embeddings/", "ppt/embeddings/")
|
||||
embed_dirs = (
|
||||
"word/embeddings/", "word/objects/", "word/activex/",
|
||||
"xl/embeddings/", "ppt/embeddings/"
|
||||
)
|
||||
for name in z.namelist():
|
||||
low = name.lower()
|
||||
if any(low.startswith(d) for d in embed_dirs):
|
||||
@ -396,24 +398,7 @@ def extract_embed_file(target: Union[bytes, bytearray]) -> List[Tuple[str, bytes
|
||||
pass
|
||||
return out
|
||||
|
||||
# PDF attachments
|
||||
if _is_pdf(head):
|
||||
try:
|
||||
doc = fitz.open(stream=top, filetype="pdf")
|
||||
count = getattr(doc, "embfile_count", 0)
|
||||
for i in range(count):
|
||||
try:
|
||||
data = doc.embfile_get(i)
|
||||
if data:
|
||||
push(bytes(data), f"EmbeddedFiles[{i}]")
|
||||
except Exception:
|
||||
pass
|
||||
doc.close()
|
||||
except Exception:
|
||||
pass
|
||||
return out
|
||||
|
||||
# Legacy OLE (.doc/.xls/.ppt)
|
||||
# OLE container (doc/ppt/xls)
|
||||
if _is_ole(head):
|
||||
try:
|
||||
with olefile.OleFileIO(io.BytesIO(top)) as ole:
|
||||
|
||||
Reference in New Issue
Block a user