Refactor: remove PPT slide image rendering. (#12909)

### What problem does this PR solve?

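Remove the `aspose-slides` dependency, together with the PPT/PPTX thumbnail generation and per-slide image rendering that depended on it.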

### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2026-01-30 13:35:42 +08:00
committed by GitHub
parent 73645e2f78
commit f1c2fac03e
5 changed files with 13 additions and 76 deletions

View File

@ -28,7 +28,6 @@ ENV DEBIAN_FRONTEND=noninteractive
# Setup apt
# Python package and implicit dependencies:
# opencv-python: libglib2.0-0 libglx-mesa0 libgl1
# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb
# python-pptx: default-jdk tika-server-standard-3.2.3.jar
# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85
# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev
@ -128,8 +127,6 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-l
mv chromedriver /usr/local/bin/ && \
rm -f /usr/bin/google-chrome
# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13
# aspose-slides on linux/arm64 is unavailable
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \
if [ "$(uname -m)" = "x86_64" ]; then \
dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \

View File

@ -84,28 +84,6 @@ def thumbnail_img(filename, blob):
buffered = BytesIO()
image.save(buffered, format="png")
return buffered.getvalue()
elif re.match(r".*\.(ppt|pptx)$", filename):
import aspose.pydrawing as drawing
import aspose.slides as slides
try:
with slides.Presentation(BytesIO(blob)) as presentation:
buffered = BytesIO()
scale = 0.03
img = None
for _ in range(10):
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
img = buffered.getvalue()
if len(img) >= 64000:
scale = scale / 2.0
buffered = BytesIO()
else:
break
return img
except Exception:
pass
return None

View File

@ -11,7 +11,6 @@ dependencies = [
"akshare>=1.15.78,<2.0.0",
"anthropic==0.34.1",
"arxiv==2.1.3",
"aspose-slides==24.7.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')",
"atlassian-python-api==4.0.7",
"azure-identity==1.17.1",
"azure-storage-file-datalake==12.16.0",

View File

@ -23,36 +23,13 @@ from PIL import Image
from PyPDF2 import PdfReader as pdf2_read
from deepdoc.parser import PdfParser, PptParser, PlainParser
from deepdoc.parser.ppt_parser import RAGFlowPptParser
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import rag_tokenizer
from rag.nlp import tokenize, is_english
class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None):
txts = super().__call__(fnm, from_page, to_page)
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page:to_page]):
try:
with BytesIO() as buffered:
slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
buffered.seek(0)
imgs.append(Image.open(buffered).copy())
except RuntimeError as e:
raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser):
def __init__(self):
super().__init__()
@ -159,15 +136,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
ppt_parser = RAGFlowPptParser()
for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
d["doc_type_kwd"] = "image"
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
tokenize(d, txt, eng)
res.append(d)
return res

31
uv.lock generated
View File

@ -35,7 +35,7 @@ wheels = [
[[package]]
name = "agentrun-sdk"
version = "0.0.16"
version = "0.0.17"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "agentrun-mem0ai" },
@ -51,9 +51,9 @@ dependencies = [
{ name = "python-dotenv" },
{ name = "typing-extensions" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/99/e4651b65a6e52a6547e17c97efc96660f6476ffa49b178253ef62d9982bd/agentrun_sdk-0.0.16.tar.gz", hash = "sha256:73900293aaa6be4d6c7304870b662e302c86f817ebe280ed34c53ea2fe054cc9", size = 232813, upload-time = "2026-01-22T09:28:32.558Z" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/26/77f2e2e9ee8d2caec776a8a4e5bc0f2d2e5b550152fec61721684f29e819/agentrun_sdk-0.0.17.tar.gz", hash = "sha256:cb0362487d0cbe0a11b21f4e12071e4dfcf9666a13e42c1bccee2d8948411ef9", size = 235373, upload-time = "2026-01-28T12:33:09.501Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/94/6ad9dd91195bfdc2e423b30184ce7cbb20e388b4ac1c6251c9e1ca17ea74/agentrun_sdk-0.0.16-py3-none-any.whl", hash = "sha256:a6dafef9f71c28e5bbc682d33258050a858d68d33a32070332e36c35cdf28720", size = 316358, upload-time = "2026-01-22T09:28:31.167Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/52/28d808d37d272d52f3aec56f7433f9aff43bdb83224a7c715c65568ac53b/agentrun_sdk-0.0.17-py3-none-any.whl", hash = "sha256:19b1ca5e49b57000973d1f755b540cdb92ecb97084891234808a20be7e72aed6", size = 318809, upload-time = "2026-01-28T12:33:07.87Z" },
]
[[package]]
@ -260,28 +260,28 @@ wheels = [
[[package]]
name = "alibabacloud-agentrun20250910"
version = "5.3.3"
version = "5.3.4"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "alibabacloud-tea-openapi" },
{ name = "darabonba-core" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/41/90db47e8a912a1f98d84cdda19850ee6c73d1fef82ad3403ba87872bef0f/alibabacloud_agentrun20250910-5.3.3.tar.gz", hash = "sha256:8615c288a2812f231fe854f8cff0bfac2e18276a6758d1794a58a6bedb6ecc76", size = 86201, upload-time = "2026-01-26T17:30:09.885Z" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/97/d4d72b7a100ae686aab2c83f1388483508fa0f3ccf1259626b18d94cd74a/alibabacloud_agentrun20250910-5.3.4.tar.gz", hash = "sha256:3ea8fd0bfebc07aede3ca55a4b189f4e0be382eaf0e58df098d1ecdcc971bed1", size = 86441, upload-time = "2026-01-28T13:20:11.535Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/39/1d92863f2fc4210ee376b2d95b2219ebe2d6d0c4cf0c07f60123a299121e/alibabacloud_agentrun20250910-5.3.3-py3-none-any.whl", hash = "sha256:87d1ed906f431ef479b01fb6dfe9151829f69a177c35c397a6b6878d57f5ad38", size = 281038, upload-time = "2026-01-26T17:30:08.739Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/bb/1dac68128e71da7974fef1c89b2af3981326ae5d0062e06a94798db9b39a/alibabacloud_agentrun20250910-5.3.4-py3-none-any.whl", hash = "sha256:7e3f708aaa94680360ec98478f705495952bb603495863bf0eadd92fe09e728c", size = 281312, upload-time = "2026-01-28T13:20:10.019Z" },
]
[[package]]
name = "alibabacloud-bailian20231229"
version = "2.8.0"
version = "2.8.1"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
dependencies = [
{ name = "alibabacloud-tea-openapi" },
{ name = "darabonba-core" },
]
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/ec/84096b2218491574ede0ceec85bc6fbe2e9c84ae3805b9a2e3888c5849f2/alibabacloud_bailian20231229-2.8.0.tar.gz", hash = "sha256:7c1db87943ef4a3ba4f04cc5b3c5c0a1de7f74ef730852cd1f55694ea550054f", size = 68014, upload-time = "2026-01-22T03:51:09.751Z" }
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/65/2aee1e58bb3eec52c4892637ee15c453b0a3c7797b9b68f49bb5e9dd4e60/alibabacloud_bailian20231229-2.8.1.tar.gz", hash = "sha256:d39a79cc11b7bd0cd59054b0c8a943923f4f3330da243c83446524aab4b63ed8", size = 68212, upload-time = "2026-01-29T07:44:52.23Z" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/ba/c68aa20f3a1fb7e222e38f37a8efb1d5f139a256ad89a4622cacf1b21756/alibabacloud_bailian20231229-2.8.0-py3-none-any.whl", hash = "sha256:8a78464ddb0de89e966a6bbd082da677099af3f44c2ae96eb327553fe9c7e1b6", size = 176573, upload-time = "2026-01-22T03:51:08.747Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/c6/97c771aa4305844c20549c61f79dc48ed418838fed77608240475f4d53cb/alibabacloud_bailian20231229-2.8.1-py3-none-any.whl", hash = "sha256:403678010e65412ee5f0f80c2a831bb50d5e4178f9e616c21fc2793232f25913", size = 176806, upload-time = "2026-01-29T07:44:50.762Z" },
]
[[package]]
@ -532,17 +532,6 @@ wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/5e/337125441af40aba86b087dee3dbe829413b6e42eac74defae2076926dbe/asana-5.2.2-py3-none-any.whl", hash = "sha256:1c8d15949a6cb9aa12363a5b7cfc6c0544cb3ae77290dd2e3255c0ec70668458", size = 203161, upload-time = "2025-09-24T21:31:02.401Z" },
]
[[package]]
name = "aspose-slides"
version = "24.7.0"
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
wheels = [
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/8f/aed51648b153c854841e882b93ab01b671a6fc4e01860450bfe21e957aa7/Aspose.Slides-24.7.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ad1386d88539fd5ba1639ea420387d88a0ef79bea265d79d453452764cf63530", size = 82204653, upload-time = "2024-07-19T09:58:13.084Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/b1/6e012da70b68c3eae23daeeec3fe4c7e11fa62af84a3ece37a660c1a488c/Aspose.Slides-24.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8025282e687a1eae80be8e92250aa91f5e0725a568627597eca1477a0a4256d", size = 60041209, upload-time = "2024-07-19T09:58:19.508Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/8c/48e760f52f46dad428fef6b7929b3ed4cfad89c2ec1b314ce7ad064d7314/Aspose.Slides-24.7.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:5793cd178a7460a0ebcc10acd77600d8ce420f844a50cb640743aa2a7878089e", size = 95758565, upload-time = "2024-07-19T09:58:25.163Z" },
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/04/c5af29852f2475c7433092c5c7701e029e1191661e8127ec72588fd720d4/Aspose.Slides-24.7.0-py3-none-win_amd64.whl", hash = "sha256:db9246fcdfcf54a1501608bd599a4b531afe753a8c23b19f53f0f48f0550712a", size = 68831159, upload-time = "2024-07-19T09:58:36.269Z" },
]
[[package]]
name = "atlassian-python-api"
version = "4.0.7"
@ -6123,7 +6112,6 @@ dependencies = [
{ name = "anthropic" },
{ name = "arxiv" },
{ name = "asana" },
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')" },
{ name = "atlassian-python-api" },
{ name = "azure-identity" },
{ name = "azure-storage-file-datalake" },
@ -6258,7 +6246,6 @@ requires-dist = [
{ name = "anthropic", specifier = "==0.34.1" },
{ name = "arxiv", specifier = "==2.1.3" },
{ name = "asana", specifier = ">=5.2.2" },
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')", specifier = "==24.7.0" },
{ name = "atlassian-python-api", specifier = "==4.0.7" },
{ name = "azure-identity", specifier = "==1.17.1" },
{ name = "azure-storage-file-datalake", specifier = "==12.16.0" },