mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-30 23:26:36 +08:00
Refa: remove ppt image. (#12909)
### What problem does this PR solve?

Remove `aspose`.

### Type of change

- [x] Refactoring
This commit is contained in:
@ -28,7 +28,6 @@ ENV DEBIAN_FRONTEND=noninteractive
|
||||
# Setup apt
|
||||
# Python package and implicit dependencies:
|
||||
# opencv-python: libglib2.0-0 libglx-mesa0 libgl1
|
||||
# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb
|
||||
# python-pptx: default-jdk tika-server-standard-3.2.3.jar
|
||||
# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85
|
||||
# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev
|
||||
@ -128,8 +127,6 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-l
|
||||
mv chromedriver /usr/local/bin/ && \
|
||||
rm -f /usr/bin/google-chrome
|
||||
|
||||
# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13
|
||||
# aspose-slides on linux/arm64 is unavailable
|
||||
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \
|
||||
if [ "$(uname -m)" = "x86_64" ]; then \
|
||||
dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \
|
||||
|
||||
@ -84,28 +84,6 @@ def thumbnail_img(filename, blob):
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="png")
|
||||
return buffered.getvalue()
|
||||
|
||||
elif re.match(r".*\.(ppt|pptx)$", filename):
|
||||
import aspose.pydrawing as drawing
|
||||
import aspose.slides as slides
|
||||
|
||||
try:
|
||||
with slides.Presentation(BytesIO(blob)) as presentation:
|
||||
buffered = BytesIO()
|
||||
scale = 0.03
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
|
||||
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000:
|
||||
scale = scale / 2.0
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
return img
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
|
||||
@ -11,7 +11,6 @@ dependencies = [
|
||||
"akshare>=1.15.78,<2.0.0",
|
||||
"anthropic==0.34.1",
|
||||
"arxiv==2.1.3",
|
||||
"aspose-slides==24.7.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')",
|
||||
"atlassian-python-api==4.0.7",
|
||||
"azure-identity==1.17.1",
|
||||
"azure-storage-file-datalake==12.16.0",
|
||||
|
||||
@ -23,36 +23,13 @@ from PIL import Image
|
||||
from PyPDF2 import PdfReader as pdf2_read
|
||||
|
||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||
from deepdoc.parser.ppt_parser import RAGFlowPptParser
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import rag_tokenizer
|
||||
from rag.nlp import tokenize, is_english
|
||||
|
||||
|
||||
class Ppt(PptParser):
    """Slide parser that pairs each slide's extracted text with a thumbnail."""

    def __call__(self, fnm, from_page, to_page, callback=None):
        """Extract text and a thumbnail image for slides ``from_page:to_page``.

        Args:
            fnm: Presentation content. It is wrapped in ``BytesIO`` before being
                opened, so presumably raw bytes -- confirm against callers.
            from_page: Index of the first slide to render (slice start).
            to_page: Index one past the last slide to render (slice stop).
            callback: Optional progress reporter, called as
                ``callback(progress, message)``. Skipped when ``None``.

        Returns:
            A list of ``(text, image)`` tuples, one per slide.

        Raises:
            RuntimeError: If rendering a slide thumbnail fails; the message
                carries the 1-based slide number within the presentation.
            AssertionError: If the number of texts and thumbnails differ.
        """

        def report(progress, message):
            # The parameter defaults to None, so never call it blindly.
            if callback is not None:
                callback(progress, message)

        txts = super().__call__(fnm, from_page, to_page)
        report(0.5, "Text extraction finished.")

        # Imported lazily: the package is only needed on this code path.
        import aspose.slides as slides
        import aspose.pydrawing as drawing

        imgs = []
        with slides.Presentation(BytesIO(fnm)) as presentation:
            selected = presentation.slides[from_page:to_page]
            # Number slides from their real position so the error message
            # points at the right page even when from_page > 0.
            for page_no, slide in enumerate(selected, start=from_page + 1):
                try:
                    with BytesIO() as buffered:
                        slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
                        buffered.seek(0)
                        # copy() detaches the image from the buffer that is
                        # closed when this ``with`` block exits.
                        imgs.append(Image.open(buffered).copy())
                except RuntimeError as e:
                    raise RuntimeError(f"ppt parse error at page {page_no}, original error: {str(e)}") from e

        # Explicit check instead of ``assert`` so it still runs under ``-O``;
        # the exception type is kept for backward compatibility.
        if len(imgs) != len(txts):
            raise AssertionError("Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)))

        report(0.9, "Image extraction finished")
        self.is_english = is_english(txts)
        return list(zip(txts, imgs))
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@ -159,15 +136,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
res = []
|
||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||
ppt_parser = Ppt()
|
||||
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||
ppt_parser = RAGFlowPptParser()
|
||||
for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
pn += from_page
|
||||
d["image"] = img
|
||||
d["doc_type_kwd"] = "image"
|
||||
d["page_num_int"] = [pn + 1]
|
||||
d["top_int"] = [0]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
||||
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
|
||||
tokenize(d, txt, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
31
uv.lock
generated
31
uv.lock
generated
@ -35,7 +35,7 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "agentrun-sdk"
|
||||
version = "0.0.16"
|
||||
version = "0.0.17"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
dependencies = [
|
||||
{ name = "agentrun-mem0ai" },
|
||||
@ -51,9 +51,9 @@ dependencies = [
|
||||
{ name = "python-dotenv" },
|
||||
{ name = "typing-extensions" },
|
||||
]
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/99/e4651b65a6e52a6547e17c97efc96660f6476ffa49b178253ef62d9982bd/agentrun_sdk-0.0.16.tar.gz", hash = "sha256:73900293aaa6be4d6c7304870b662e302c86f817ebe280ed34c53ea2fe054cc9", size = 232813, upload-time = "2026-01-22T09:28:32.558Z" }
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/26/77f2e2e9ee8d2caec776a8a4e5bc0f2d2e5b550152fec61721684f29e819/agentrun_sdk-0.0.17.tar.gz", hash = "sha256:cb0362487d0cbe0a11b21f4e12071e4dfcf9666a13e42c1bccee2d8948411ef9", size = 235373, upload-time = "2026-01-28T12:33:09.501Z" }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/94/6ad9dd91195bfdc2e423b30184ce7cbb20e388b4ac1c6251c9e1ca17ea74/agentrun_sdk-0.0.16-py3-none-any.whl", hash = "sha256:a6dafef9f71c28e5bbc682d33258050a858d68d33a32070332e36c35cdf28720", size = 316358, upload-time = "2026-01-22T09:28:31.167Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/52/28d808d37d272d52f3aec56f7433f9aff43bdb83224a7c715c65568ac53b/agentrun_sdk-0.0.17-py3-none-any.whl", hash = "sha256:19b1ca5e49b57000973d1f755b540cdb92ecb97084891234808a20be7e72aed6", size = 318809, upload-time = "2026-01-28T12:33:07.87Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -260,28 +260,28 @@ wheels = [
|
||||
|
||||
[[package]]
|
||||
name = "alibabacloud-agentrun20250910"
|
||||
version = "5.3.3"
|
||||
version = "5.3.4"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
dependencies = [
|
||||
{ name = "alibabacloud-tea-openapi" },
|
||||
{ name = "darabonba-core" },
|
||||
]
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/41/90db47e8a912a1f98d84cdda19850ee6c73d1fef82ad3403ba87872bef0f/alibabacloud_agentrun20250910-5.3.3.tar.gz", hash = "sha256:8615c288a2812f231fe854f8cff0bfac2e18276a6758d1794a58a6bedb6ecc76", size = 86201, upload-time = "2026-01-26T17:30:09.885Z" }
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/97/d4d72b7a100ae686aab2c83f1388483508fa0f3ccf1259626b18d94cd74a/alibabacloud_agentrun20250910-5.3.4.tar.gz", hash = "sha256:3ea8fd0bfebc07aede3ca55a4b189f4e0be382eaf0e58df098d1ecdcc971bed1", size = 86441, upload-time = "2026-01-28T13:20:11.535Z" }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/39/1d92863f2fc4210ee376b2d95b2219ebe2d6d0c4cf0c07f60123a299121e/alibabacloud_agentrun20250910-5.3.3-py3-none-any.whl", hash = "sha256:87d1ed906f431ef479b01fb6dfe9151829f69a177c35c397a6b6878d57f5ad38", size = 281038, upload-time = "2026-01-26T17:30:08.739Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/bb/1dac68128e71da7974fef1c89b2af3981326ae5d0062e06a94798db9b39a/alibabacloud_agentrun20250910-5.3.4-py3-none-any.whl", hash = "sha256:7e3f708aaa94680360ec98478f705495952bb603495863bf0eadd92fe09e728c", size = 281312, upload-time = "2026-01-28T13:20:10.019Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "alibabacloud-bailian20231229"
|
||||
version = "2.8.0"
|
||||
version = "2.8.1"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
dependencies = [
|
||||
{ name = "alibabacloud-tea-openapi" },
|
||||
{ name = "darabonba-core" },
|
||||
]
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/ec/84096b2218491574ede0ceec85bc6fbe2e9c84ae3805b9a2e3888c5849f2/alibabacloud_bailian20231229-2.8.0.tar.gz", hash = "sha256:7c1db87943ef4a3ba4f04cc5b3c5c0a1de7f74ef730852cd1f55694ea550054f", size = 68014, upload-time = "2026-01-22T03:51:09.751Z" }
|
||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/65/2aee1e58bb3eec52c4892637ee15c453b0a3c7797b9b68f49bb5e9dd4e60/alibabacloud_bailian20231229-2.8.1.tar.gz", hash = "sha256:d39a79cc11b7bd0cd59054b0c8a943923f4f3330da243c83446524aab4b63ed8", size = 68212, upload-time = "2026-01-29T07:44:52.23Z" }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/ba/c68aa20f3a1fb7e222e38f37a8efb1d5f139a256ad89a4622cacf1b21756/alibabacloud_bailian20231229-2.8.0-py3-none-any.whl", hash = "sha256:8a78464ddb0de89e966a6bbd082da677099af3f44c2ae96eb327553fe9c7e1b6", size = 176573, upload-time = "2026-01-22T03:51:08.747Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/c6/97c771aa4305844c20549c61f79dc48ed418838fed77608240475f4d53cb/alibabacloud_bailian20231229-2.8.1-py3-none-any.whl", hash = "sha256:403678010e65412ee5f0f80c2a831bb50d5e4178f9e616c21fc2793232f25913", size = 176806, upload-time = "2026-01-29T07:44:50.762Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -532,17 +532,6 @@ wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/5e/337125441af40aba86b087dee3dbe829413b6e42eac74defae2076926dbe/asana-5.2.2-py3-none-any.whl", hash = "sha256:1c8d15949a6cb9aa12363a5b7cfc6c0544cb3ae77290dd2e3255c0ec70668458", size = 203161, upload-time = "2025-09-24T21:31:02.401Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "aspose-slides"
|
||||
version = "24.7.0"
|
||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||
wheels = [
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/8f/aed51648b153c854841e882b93ab01b671a6fc4e01860450bfe21e957aa7/Aspose.Slides-24.7.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ad1386d88539fd5ba1639ea420387d88a0ef79bea265d79d453452764cf63530", size = 82204653, upload-time = "2024-07-19T09:58:13.084Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/b1/6e012da70b68c3eae23daeeec3fe4c7e11fa62af84a3ece37a660c1a488c/Aspose.Slides-24.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8025282e687a1eae80be8e92250aa91f5e0725a568627597eca1477a0a4256d", size = 60041209, upload-time = "2024-07-19T09:58:19.508Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/8c/48e760f52f46dad428fef6b7929b3ed4cfad89c2ec1b314ce7ad064d7314/Aspose.Slides-24.7.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:5793cd178a7460a0ebcc10acd77600d8ce420f844a50cb640743aa2a7878089e", size = 95758565, upload-time = "2024-07-19T09:58:25.163Z" },
|
||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/04/c5af29852f2475c7433092c5c7701e029e1191661e8127ec72588fd720d4/Aspose.Slides-24.7.0-py3-none-win_amd64.whl", hash = "sha256:db9246fcdfcf54a1501608bd599a4b531afe753a8c23b19f53f0f48f0550712a", size = 68831159, upload-time = "2024-07-19T09:58:36.269Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atlassian-python-api"
|
||||
version = "4.0.7"
|
||||
@ -6123,7 +6112,6 @@ dependencies = [
|
||||
{ name = "anthropic" },
|
||||
{ name = "arxiv" },
|
||||
{ name = "asana" },
|
||||
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')" },
|
||||
{ name = "atlassian-python-api" },
|
||||
{ name = "azure-identity" },
|
||||
{ name = "azure-storage-file-datalake" },
|
||||
@ -6258,7 +6246,6 @@ requires-dist = [
|
||||
{ name = "anthropic", specifier = "==0.34.1" },
|
||||
{ name = "arxiv", specifier = "==2.1.3" },
|
||||
{ name = "asana", specifier = ">=5.2.2" },
|
||||
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')", specifier = "==24.7.0" },
|
||||
{ name = "atlassian-python-api", specifier = "==4.0.7" },
|
||||
{ name = "azure-identity", specifier = "==1.17.1" },
|
||||
{ name = "azure-storage-file-datalake", specifier = "==12.16.0" },
|
||||
|
||||
Reference in New Issue
Block a user