diff --git a/Dockerfile b/Dockerfile index c1859878e..b8a8ef5f3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,6 @@ ENV DEBIAN_FRONTEND=noninteractive # Setup apt # Python package and implicit dependencies: # opencv-python: libglib2.0-0 libglx-mesa0 libgl1 -# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb # python-pptx: default-jdk tika-server-standard-3.2.3.jar # selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85 # Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev @@ -128,8 +127,6 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-l mv chromedriver /usr/local/bin/ && \ rm -f /usr/bin/google-chrome -# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13 -# aspose-slides on linux/arm64 is unavailable RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \ if [ "$(uname -m)" = "x86_64" ]; then \ dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \ diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index 4cad64c35..e73c5d218 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -84,28 +84,6 @@ def thumbnail_img(filename, blob): buffered = BytesIO() image.save(buffered, format="png") return buffered.getvalue() - - elif re.match(r".*\.(ppt|pptx)$", filename): - import aspose.pydrawing as drawing - import aspose.slides as slides - - try: - with slides.Presentation(BytesIO(blob)) as presentation: - buffered = BytesIO() - scale = 0.03 - img = None - for _ in range(10): - # https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float - presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png) - img = buffered.getvalue() - if len(img) >= 64000: - scale = scale / 2.0 - buffered = BytesIO() - else: - break - return img - except Exception: - pass return None diff --git a/pyproject.toml b/pyproject.toml index 117a7408b..4ae26ae99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,6 @@ dependencies = [ "akshare>=1.15.78,<2.0.0", "anthropic==0.34.1", "arxiv==2.1.3", - "aspose-slides==24.7.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')", "atlassian-python-api==4.0.7", "azure-identity==1.17.1", "azure-storage-file-datalake==12.16.0", diff --git a/rag/app/presentation.py b/rag/app/presentation.py index e4247e8cc..9b7a8f694 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -23,36 +23,13 @@ from PIL import Image from PyPDF2 import PdfReader as pdf2_read from deepdoc.parser import PdfParser, PptParser, PlainParser +from deepdoc.parser.ppt_parser import RAGFlowPptParser from rag.app.naive import by_plaintext, PARSERS from common.parser_config_utils import normalize_layout_recognizer from rag.nlp import rag_tokenizer from rag.nlp import tokenize, is_english -class Ppt(PptParser): - def __call__(self, fnm, from_page, to_page, callback=None): - txts = super().__call__(fnm, from_page, to_page) - - callback(0.5, "Text extraction finished.") - import aspose.slides as slides - import aspose.pydrawing as drawing - - imgs = [] - with slides.Presentation(BytesIO(fnm)) as presentation: - for i, slide in enumerate(presentation.slides[from_page:to_page]): - try: - with BytesIO() as buffered: - slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg) - buffered.seek(0) - imgs.append(Image.open(buffered).copy()) - except RuntimeError as e: - raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e - assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)) - callback(0.9, "Image extraction finished") - self.is_english = is_english(txts) - return [(txts[i], imgs[i]) for i in range(len(txts))] - - class Pdf(PdfParser): def __init__(self): super().__init__() @@ -159,15 +136,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): - ppt_parser = Ppt() - for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): + ppt_parser = RAGFlowPptParser() + for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): d = copy.deepcopy(doc) pn += from_page - d["image"] = img d["doc_type_kwd"] = "image" d["page_num_int"] = [pn + 1] d["top_int"] = [0] - d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])] + d["position_int"] = [(pn + 1, 0, 0, 0, 0)] tokenize(d, txt, eng) res.append(d) return res diff --git a/uv.lock b/uv.lock index 109a4b1db..a1b0323b2 100644 --- a/uv.lock +++ b/uv.lock @@ -35,7 +35,7 @@ wheels = [ [[package]] name = "agentrun-sdk" -version = "0.0.16" +version = "0.0.17" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ { name = "agentrun-mem0ai" }, @@ -51,9 +51,9 @@ dependencies = [ { name = "python-dotenv" }, { name = "typing-extensions" }, ] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/99/e4651b65a6e52a6547e17c97efc96660f6476ffa49b178253ef62d9982bd/agentrun_sdk-0.0.16.tar.gz", hash = "sha256:73900293aaa6be4d6c7304870b662e302c86f817ebe280ed34c53ea2fe054cc9", size = 232813, upload-time = "2026-01-22T09:28:32.558Z" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/26/77f2e2e9ee8d2caec776a8a4e5bc0f2d2e5b550152fec61721684f29e819/agentrun_sdk-0.0.17.tar.gz", hash = "sha256:cb0362487d0cbe0a11b21f4e12071e4dfcf9666a13e42c1bccee2d8948411ef9", size = 235373, upload-time = "2026-01-28T12:33:09.501Z" } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/94/6ad9dd91195bfdc2e423b30184ce7cbb20e388b4ac1c6251c9e1ca17ea74/agentrun_sdk-0.0.16-py3-none-any.whl", hash = "sha256:a6dafef9f71c28e5bbc682d33258050a858d68d33a32070332e36c35cdf28720", size = 316358, upload-time = "2026-01-22T09:28:31.167Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/52/28d808d37d272d52f3aec56f7433f9aff43bdb83224a7c715c65568ac53b/agentrun_sdk-0.0.17-py3-none-any.whl", hash = "sha256:19b1ca5e49b57000973d1f755b540cdb92ecb97084891234808a20be7e72aed6", size = 318809, upload-time = "2026-01-28T12:33:07.87Z" }, ] [[package]] @@ -260,28 +260,28 @@ wheels = [ [[package]] name = "alibabacloud-agentrun20250910" -version = "5.3.3" +version = "5.3.4" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ { name = "alibabacloud-tea-openapi" }, { name = "darabonba-core" }, ] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/41/90db47e8a912a1f98d84cdda19850ee6c73d1fef82ad3403ba87872bef0f/alibabacloud_agentrun20250910-5.3.3.tar.gz", hash = "sha256:8615c288a2812f231fe854f8cff0bfac2e18276a6758d1794a58a6bedb6ecc76", size = 86201, upload-time = "2026-01-26T17:30:09.885Z" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/97/d4d72b7a100ae686aab2c83f1388483508fa0f3ccf1259626b18d94cd74a/alibabacloud_agentrun20250910-5.3.4.tar.gz", hash = "sha256:3ea8fd0bfebc07aede3ca55a4b189f4e0be382eaf0e58df098d1ecdcc971bed1", size = 86441, upload-time = "2026-01-28T13:20:11.535Z" } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/39/1d92863f2fc4210ee376b2d95b2219ebe2d6d0c4cf0c07f60123a299121e/alibabacloud_agentrun20250910-5.3.3-py3-none-any.whl", hash = "sha256:87d1ed906f431ef479b01fb6dfe9151829f69a177c35c397a6b6878d57f5ad38", size = 281038, upload-time = "2026-01-26T17:30:08.739Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/bb/1dac68128e71da7974fef1c89b2af3981326ae5d0062e06a94798db9b39a/alibabacloud_agentrun20250910-5.3.4-py3-none-any.whl", hash = "sha256:7e3f708aaa94680360ec98478f705495952bb603495863bf0eadd92fe09e728c", size = 281312, upload-time = "2026-01-28T13:20:10.019Z" }, ] [[package]] name = "alibabacloud-bailian20231229" -version = "2.8.0" +version = "2.8.1" source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } dependencies = [ { name = "alibabacloud-tea-openapi" }, { name = "darabonba-core" }, ] -sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/ec/84096b2218491574ede0ceec85bc6fbe2e9c84ae3805b9a2e3888c5849f2/alibabacloud_bailian20231229-2.8.0.tar.gz", hash = "sha256:7c1db87943ef4a3ba4f04cc5b3c5c0a1de7f74ef730852cd1f55694ea550054f", size = 68014, upload-time = "2026-01-22T03:51:09.751Z" } +sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/65/2aee1e58bb3eec52c4892637ee15c453b0a3c7797b9b68f49bb5e9dd4e60/alibabacloud_bailian20231229-2.8.1.tar.gz", hash = "sha256:d39a79cc11b7bd0cd59054b0c8a943923f4f3330da243c83446524aab4b63ed8", size = 68212, upload-time = "2026-01-29T07:44:52.23Z" } wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/ba/c68aa20f3a1fb7e222e38f37a8efb1d5f139a256ad89a4622cacf1b21756/alibabacloud_bailian20231229-2.8.0-py3-none-any.whl", hash = "sha256:8a78464ddb0de89e966a6bbd082da677099af3f44c2ae96eb327553fe9c7e1b6", size = 176573, upload-time = "2026-01-22T03:51:08.747Z" }, + { url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/c6/97c771aa4305844c20549c61f79dc48ed418838fed77608240475f4d53cb/alibabacloud_bailian20231229-2.8.1-py3-none-any.whl", hash = "sha256:403678010e65412ee5f0f80c2a831bb50d5e4178f9e616c21fc2793232f25913", size = 176806, upload-time = "2026-01-29T07:44:50.762Z" }, ] [[package]] @@ -532,17 +532,6 @@ wheels = [ { url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/5e/337125441af40aba86b087dee3dbe829413b6e42eac74defae2076926dbe/asana-5.2.2-py3-none-any.whl", hash = "sha256:1c8d15949a6cb9aa12363a5b7cfc6c0544cb3ae77290dd2e3255c0ec70668458", size = 203161, upload-time = "2025-09-24T21:31:02.401Z" }, ] -[[package]] -name = "aspose-slides" -version = "24.7.0" -source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" } -wheels = [ - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/8f/aed51648b153c854841e882b93ab01b671a6fc4e01860450bfe21e957aa7/Aspose.Slides-24.7.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ad1386d88539fd5ba1639ea420387d88a0ef79bea265d79d453452764cf63530", size = 82204653, upload-time = "2024-07-19T09:58:13.084Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/b1/6e012da70b68c3eae23daeeec3fe4c7e11fa62af84a3ece37a660c1a488c/Aspose.Slides-24.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8025282e687a1eae80be8e92250aa91f5e0725a568627597eca1477a0a4256d", size = 60041209, upload-time = "2024-07-19T09:58:19.508Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/8c/48e760f52f46dad428fef6b7929b3ed4cfad89c2ec1b314ce7ad064d7314/Aspose.Slides-24.7.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:5793cd178a7460a0ebcc10acd77600d8ce420f844a50cb640743aa2a7878089e", size = 95758565, upload-time = "2024-07-19T09:58:25.163Z" }, - { url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/04/c5af29852f2475c7433092c5c7701e029e1191661e8127ec72588fd720d4/Aspose.Slides-24.7.0-py3-none-win_amd64.whl", hash = "sha256:db9246fcdfcf54a1501608bd599a4b531afe753a8c23b19f53f0f48f0550712a", size = 68831159, upload-time = "2024-07-19T09:58:36.269Z" }, -] - [[package]] name = "atlassian-python-api" version = "4.0.7" @@ -6123,7 +6112,6 @@ dependencies = [ { name = "anthropic" }, { name = "arxiv" }, { name = "asana" }, - { name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')" }, { name = "atlassian-python-api" }, { name = "azure-identity" }, { name = "azure-storage-file-datalake" }, @@ -6258,7 +6246,6 @@ requires-dist = [ { name = "anthropic", specifier = "==0.34.1" }, { name = "arxiv", specifier = "==2.1.3" }, { name = "asana", specifier = ">=5.2.2" }, - { name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')", specifier = "==24.7.0" }, { name = "atlassian-python-api", specifier = "==4.0.7" }, { name = "azure-identity", specifier = "==1.17.1" }, { name = "azure-storage-file-datalake", specifier = "==12.16.0" },