mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-01 16:15:07 +08:00
Refa: remove ppt image. (#12909)
### What problem does this PR solve?

remove `aspose`

### Type of change

- [x] Refactoring
This commit is contained in:
@ -28,7 +28,6 @@ ENV DEBIAN_FRONTEND=noninteractive
|
|||||||
# Setup apt
|
# Setup apt
|
||||||
# Python package and implicit dependencies:
|
# Python package and implicit dependencies:
|
||||||
# opencv-python: libglib2.0-0 libglx-mesa0 libgl1
|
# opencv-python: libglib2.0-0 libglx-mesa0 libgl1
|
||||||
# aspose-slides: pkg-config libicu-dev libgdiplus libssl1.1_1.1.1f-1ubuntu2_amd64.deb
|
|
||||||
# python-pptx: default-jdk tika-server-standard-3.2.3.jar
|
# python-pptx: default-jdk tika-server-standard-3.2.3.jar
|
||||||
# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85
|
# selenium: libatk-bridge2.0-0 chrome-linux64-121-0-6167-85
|
||||||
# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev
|
# Building C extensions: libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev
|
||||||
@ -128,8 +127,6 @@ RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/chromedriver-l
|
|||||||
mv chromedriver /usr/local/bin/ && \
|
mv chromedriver /usr/local/bin/ && \
|
||||||
rm -f /usr/bin/google-chrome
|
rm -f /usr/bin/google-chrome
|
||||||
|
|
||||||
# https://forum.aspose.com/t/aspose-slides-for-net-no-usable-version-of-libssl-found-with-linux-server/271344/13
|
|
||||||
# aspose-slides on linux/arm64 is unavailable
|
|
||||||
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \
|
RUN --mount=type=bind,from=infiniflow/ragflow_deps:latest,source=/,target=/deps \
|
||||||
if [ "$(uname -m)" = "x86_64" ]; then \
|
if [ "$(uname -m)" = "x86_64" ]; then \
|
||||||
dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \
|
dpkg -i /deps/libssl1.1_1.1.1f-1ubuntu2_amd64.deb; \
|
||||||
|
|||||||
@ -84,28 +84,6 @@ def thumbnail_img(filename, blob):
|
|||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
image.save(buffered, format="png")
|
image.save(buffered, format="png")
|
||||||
return buffered.getvalue()
|
return buffered.getvalue()
|
||||||
|
|
||||||
elif re.match(r".*\.(ppt|pptx)$", filename):
|
|
||||||
import aspose.pydrawing as drawing
|
|
||||||
import aspose.slides as slides
|
|
||||||
|
|
||||||
try:
|
|
||||||
with slides.Presentation(BytesIO(blob)) as presentation:
|
|
||||||
buffered = BytesIO()
|
|
||||||
scale = 0.03
|
|
||||||
img = None
|
|
||||||
for _ in range(10):
|
|
||||||
# https://reference.aspose.com/slides/python-net/aspose.slides/slide/get_thumbnail/#float-float
|
|
||||||
presentation.slides[0].get_thumbnail(scale, scale).save(buffered, drawing.imaging.ImageFormat.png)
|
|
||||||
img = buffered.getvalue()
|
|
||||||
if len(img) >= 64000:
|
|
||||||
scale = scale / 2.0
|
|
||||||
buffered = BytesIO()
|
|
||||||
else:
|
|
||||||
break
|
|
||||||
return img
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,6 @@ dependencies = [
|
|||||||
"akshare>=1.15.78,<2.0.0",
|
"akshare>=1.15.78,<2.0.0",
|
||||||
"anthropic==0.34.1",
|
"anthropic==0.34.1",
|
||||||
"arxiv==2.1.3",
|
"arxiv==2.1.3",
|
||||||
"aspose-slides==24.7.0; platform_machine == 'x86_64' or (sys_platform == 'darwin' and platform_machine == 'arm64')",
|
|
||||||
"atlassian-python-api==4.0.7",
|
"atlassian-python-api==4.0.7",
|
||||||
"azure-identity==1.17.1",
|
"azure-identity==1.17.1",
|
||||||
"azure-storage-file-datalake==12.16.0",
|
"azure-storage-file-datalake==12.16.0",
|
||||||
|
|||||||
@ -23,36 +23,13 @@ from PIL import Image
|
|||||||
from PyPDF2 import PdfReader as pdf2_read
|
from PyPDF2 import PdfReader as pdf2_read
|
||||||
|
|
||||||
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
from deepdoc.parser import PdfParser, PptParser, PlainParser
|
||||||
|
from deepdoc.parser.ppt_parser import RAGFlowPptParser
|
||||||
from rag.app.naive import by_plaintext, PARSERS
|
from rag.app.naive import by_plaintext, PARSERS
|
||||||
from common.parser_config_utils import normalize_layout_recognizer
|
from common.parser_config_utils import normalize_layout_recognizer
|
||||||
from rag.nlp import rag_tokenizer
|
from rag.nlp import rag_tokenizer
|
||||||
from rag.nlp import tokenize, is_english
|
from rag.nlp import tokenize, is_english
|
||||||
|
|
||||||
|
|
||||||
class Ppt(PptParser):
|
|
||||||
def __call__(self, fnm, from_page, to_page, callback=None):
|
|
||||||
txts = super().__call__(fnm, from_page, to_page)
|
|
||||||
|
|
||||||
callback(0.5, "Text extraction finished.")
|
|
||||||
import aspose.slides as slides
|
|
||||||
import aspose.pydrawing as drawing
|
|
||||||
|
|
||||||
imgs = []
|
|
||||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
|
||||||
for i, slide in enumerate(presentation.slides[from_page:to_page]):
|
|
||||||
try:
|
|
||||||
with BytesIO() as buffered:
|
|
||||||
slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
|
|
||||||
buffered.seek(0)
|
|
||||||
imgs.append(Image.open(buffered).copy())
|
|
||||||
except RuntimeError as e:
|
|
||||||
raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
|
|
||||||
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
|
||||||
callback(0.9, "Image extraction finished")
|
|
||||||
self.is_english = is_english(txts)
|
|
||||||
return [(txts[i], imgs[i]) for i in range(len(txts))]
|
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@ -159,15 +136,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
res = []
|
res = []
|
||||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||||
ppt_parser = Ppt()
|
ppt_parser = RAGFlowPptParser()
|
||||||
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||||
d = copy.deepcopy(doc)
|
d = copy.deepcopy(doc)
|
||||||
pn += from_page
|
pn += from_page
|
||||||
d["image"] = img
|
|
||||||
d["doc_type_kwd"] = "image"
|
d["doc_type_kwd"] = "image"
|
||||||
d["page_num_int"] = [pn + 1]
|
d["page_num_int"] = [pn + 1]
|
||||||
d["top_int"] = [0]
|
d["top_int"] = [0]
|
||||||
d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
|
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
|
||||||
tokenize(d, txt, eng)
|
tokenize(d, txt, eng)
|
||||||
res.append(d)
|
res.append(d)
|
||||||
return res
|
return res
|
||||||
|
|||||||
31
uv.lock
generated
31
uv.lock
generated
@ -35,7 +35,7 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "agentrun-sdk"
|
name = "agentrun-sdk"
|
||||||
version = "0.0.16"
|
version = "0.0.17"
|
||||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "agentrun-mem0ai" },
|
{ name = "agentrun-mem0ai" },
|
||||||
@ -51,9 +51,9 @@ dependencies = [
|
|||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
{ name = "typing-extensions" },
|
{ name = "typing-extensions" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/34/99/e4651b65a6e52a6547e17c97efc96660f6476ffa49b178253ef62d9982bd/agentrun_sdk-0.0.16.tar.gz", hash = "sha256:73900293aaa6be4d6c7304870b662e302c86f817ebe280ed34c53ea2fe054cc9", size = 232813, upload-time = "2026-01-22T09:28:32.558Z" }
|
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/a7/26/77f2e2e9ee8d2caec776a8a4e5bc0f2d2e5b550152fec61721684f29e819/agentrun_sdk-0.0.17.tar.gz", hash = "sha256:cb0362487d0cbe0a11b21f4e12071e4dfcf9666a13e42c1bccee2d8948411ef9", size = 235373, upload-time = "2026-01-28T12:33:09.501Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/5d/94/6ad9dd91195bfdc2e423b30184ce7cbb20e388b4ac1c6251c9e1ca17ea74/agentrun_sdk-0.0.16-py3-none-any.whl", hash = "sha256:a6dafef9f71c28e5bbc682d33258050a858d68d33a32070332e36c35cdf28720", size = 316358, upload-time = "2026-01-22T09:28:31.167Z" },
|
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/b0/52/28d808d37d272d52f3aec56f7433f9aff43bdb83224a7c715c65568ac53b/agentrun_sdk-0.0.17-py3-none-any.whl", hash = "sha256:19b1ca5e49b57000973d1f755b540cdb92ecb97084891234808a20be7e72aed6", size = 318809, upload-time = "2026-01-28T12:33:07.87Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -260,28 +260,28 @@ wheels = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "alibabacloud-agentrun20250910"
|
name = "alibabacloud-agentrun20250910"
|
||||||
version = "5.3.3"
|
version = "5.3.4"
|
||||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "alibabacloud-tea-openapi" },
|
{ name = "alibabacloud-tea-openapi" },
|
||||||
{ name = "darabonba-core" },
|
{ name = "darabonba-core" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/1d/41/90db47e8a912a1f98d84cdda19850ee6c73d1fef82ad3403ba87872bef0f/alibabacloud_agentrun20250910-5.3.3.tar.gz", hash = "sha256:8615c288a2812f231fe854f8cff0bfac2e18276a6758d1794a58a6bedb6ecc76", size = 86201, upload-time = "2026-01-26T17:30:09.885Z" }
|
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/7e/97/d4d72b7a100ae686aab2c83f1388483508fa0f3ccf1259626b18d94cd74a/alibabacloud_agentrun20250910-5.3.4.tar.gz", hash = "sha256:3ea8fd0bfebc07aede3ca55a4b189f4e0be382eaf0e58df098d1ecdcc971bed1", size = 86441, upload-time = "2026-01-28T13:20:11.535Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/90/39/1d92863f2fc4210ee376b2d95b2219ebe2d6d0c4cf0c07f60123a299121e/alibabacloud_agentrun20250910-5.3.3-py3-none-any.whl", hash = "sha256:87d1ed906f431ef479b01fb6dfe9151829f69a177c35c397a6b6878d57f5ad38", size = 281038, upload-time = "2026-01-26T17:30:08.739Z" },
|
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/8e/bb/1dac68128e71da7974fef1c89b2af3981326ae5d0062e06a94798db9b39a/alibabacloud_agentrun20250910-5.3.4-py3-none-any.whl", hash = "sha256:7e3f708aaa94680360ec98478f705495952bb603495863bf0eadd92fe09e728c", size = 281312, upload-time = "2026-01-28T13:20:10.019Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "alibabacloud-bailian20231229"
|
name = "alibabacloud-bailian20231229"
|
||||||
version = "2.8.0"
|
version = "2.8.1"
|
||||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "alibabacloud-tea-openapi" },
|
{ name = "alibabacloud-tea-openapi" },
|
||||||
{ name = "darabonba-core" },
|
{ name = "darabonba-core" },
|
||||||
]
|
]
|
||||||
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/09/ec/84096b2218491574ede0ceec85bc6fbe2e9c84ae3805b9a2e3888c5849f2/alibabacloud_bailian20231229-2.8.0.tar.gz", hash = "sha256:7c1db87943ef4a3ba4f04cc5b3c5c0a1de7f74ef730852cd1f55694ea550054f", size = 68014, upload-time = "2026-01-22T03:51:09.751Z" }
|
sdist = { url = "https://pypi.tuna.tsinghua.edu.cn/packages/b4/65/2aee1e58bb3eec52c4892637ee15c453b0a3c7797b9b68f49bb5e9dd4e60/alibabacloud_bailian20231229-2.8.1.tar.gz", hash = "sha256:d39a79cc11b7bd0cd59054b0c8a943923f4f3330da243c83446524aab4b63ed8", size = 68212, upload-time = "2026-01-29T07:44:52.23Z" }
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/74/ba/c68aa20f3a1fb7e222e38f37a8efb1d5f139a256ad89a4622cacf1b21756/alibabacloud_bailian20231229-2.8.0-py3-none-any.whl", hash = "sha256:8a78464ddb0de89e966a6bbd082da677099af3f44c2ae96eb327553fe9c7e1b6", size = 176573, upload-time = "2026-01-22T03:51:08.747Z" },
|
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/50/c6/97c771aa4305844c20549c61f79dc48ed418838fed77608240475f4d53cb/alibabacloud_bailian20231229-2.8.1-py3-none-any.whl", hash = "sha256:403678010e65412ee5f0f80c2a831bb50d5e4178f9e616c21fc2793232f25913", size = 176806, upload-time = "2026-01-29T07:44:50.762Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -532,17 +532,6 @@ wheels = [
|
|||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/5e/337125441af40aba86b087dee3dbe829413b6e42eac74defae2076926dbe/asana-5.2.2-py3-none-any.whl", hash = "sha256:1c8d15949a6cb9aa12363a5b7cfc6c0544cb3ae77290dd2e3255c0ec70668458", size = 203161, upload-time = "2025-09-24T21:31:02.401Z" },
|
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/26/5e/337125441af40aba86b087dee3dbe829413b6e42eac74defae2076926dbe/asana-5.2.2-py3-none-any.whl", hash = "sha256:1c8d15949a6cb9aa12363a5b7cfc6c0544cb3ae77290dd2e3255c0ec70668458", size = 203161, upload-time = "2025-09-24T21:31:02.401Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "aspose-slides"
|
|
||||||
version = "24.7.0"
|
|
||||||
source = { registry = "https://pypi.tuna.tsinghua.edu.cn/simple" }
|
|
||||||
wheels = [
|
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/ea/8f/aed51648b153c854841e882b93ab01b671a6fc4e01860450bfe21e957aa7/Aspose.Slides-24.7.0-py3-none-macosx_10_14_x86_64.whl", hash = "sha256:ad1386d88539fd5ba1639ea420387d88a0ef79bea265d79d453452764cf63530", size = 82204653, upload-time = "2024-07-19T09:58:13.084Z" },
|
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/97/b1/6e012da70b68c3eae23daeeec3fe4c7e11fa62af84a3ece37a660c1a488c/Aspose.Slides-24.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:d8025282e687a1eae80be8e92250aa91f5e0725a568627597eca1477a0a4256d", size = 60041209, upload-time = "2024-07-19T09:58:19.508Z" },
|
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/1a/8c/48e760f52f46dad428fef6b7929b3ed4cfad89c2ec1b314ce7ad064d7314/Aspose.Slides-24.7.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:5793cd178a7460a0ebcc10acd77600d8ce420f844a50cb640743aa2a7878089e", size = 95758565, upload-time = "2024-07-19T09:58:25.163Z" },
|
|
||||||
{ url = "https://pypi.tuna.tsinghua.edu.cn/packages/d8/04/c5af29852f2475c7433092c5c7701e029e1191661e8127ec72588fd720d4/Aspose.Slides-24.7.0-py3-none-win_amd64.whl", hash = "sha256:db9246fcdfcf54a1501608bd599a4b531afe753a8c23b19f53f0f48f0550712a", size = 68831159, upload-time = "2024-07-19T09:58:36.269Z" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "atlassian-python-api"
|
name = "atlassian-python-api"
|
||||||
version = "4.0.7"
|
version = "4.0.7"
|
||||||
@ -6123,7 +6112,6 @@ dependencies = [
|
|||||||
{ name = "anthropic" },
|
{ name = "anthropic" },
|
||||||
{ name = "arxiv" },
|
{ name = "arxiv" },
|
||||||
{ name = "asana" },
|
{ name = "asana" },
|
||||||
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')" },
|
|
||||||
{ name = "atlassian-python-api" },
|
{ name = "atlassian-python-api" },
|
||||||
{ name = "azure-identity" },
|
{ name = "azure-identity" },
|
||||||
{ name = "azure-storage-file-datalake" },
|
{ name = "azure-storage-file-datalake" },
|
||||||
@ -6258,7 +6246,6 @@ requires-dist = [
|
|||||||
{ name = "anthropic", specifier = "==0.34.1" },
|
{ name = "anthropic", specifier = "==0.34.1" },
|
||||||
{ name = "arxiv", specifier = "==2.1.3" },
|
{ name = "arxiv", specifier = "==2.1.3" },
|
||||||
{ name = "asana", specifier = ">=5.2.2" },
|
{ name = "asana", specifier = ">=5.2.2" },
|
||||||
{ name = "aspose-slides", marker = "platform_machine == 'x86_64' or (platform_machine == 'arm64' and sys_platform == 'darwin')", specifier = "==24.7.0" },
|
|
||||||
{ name = "atlassian-python-api", specifier = "==4.0.7" },
|
{ name = "atlassian-python-api", specifier = "==4.0.7" },
|
||||||
{ name = "azure-identity", specifier = "==1.17.1" },
|
{ name = "azure-identity", specifier = "==1.17.1" },
|
||||||
{ name = "azure-storage-file-datalake", specifier = "==12.16.0" },
|
{ name = "azure-storage-file-datalake", specifier = "==12.16.0" },
|
||||||
|
|||||||
Reference in New Issue
Block a user