From 1a2d69edc49d66c9e30191830fa4dbd5728975e5 Mon Sep 17 00:00:00 2001
From: eviaaaaa <2278596667@qq.com>
Date: Mon, 2 Feb 2026 13:40:51 +0800
Subject: [PATCH] feat: Implement legacy .ppt parsing via Tika (alternative to
Aspose) (#12932)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
## What problem does this PR solve?
This PR implements parsing support for legacy PowerPoint files (`.ppt`,
97-2003 format).
Currently, parsing these files fails because `python-pptx` **natively
lacks support** for the legacy OLE2 binary format.
## **Context:**
I originally used `aspose-slides` for this purpose. However, since
`aspose-slides` is **no longer a project dependency**, I implemented a
fallback mechanism using the existing `tika-server` instead, to ensure
compatibility and stability.
## **Key Changes:**
- **Fallback Logic**: Modified `rag/app/presentation.py` to catch
`python-pptx` failures and automatically fall back to Tika parsing (see
the sketch after this list).
- **No New Dependencies**: Utilizes the `tika` service that is already
part of the RAGFlow stack.
- **Note**: Since Tika focuses on text extraction, this implementation
extracts text content but does not generate slide thumbnails.
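
In outline, the fallback works as sketched below. This is a simplified,
hypothetical extraction of the logic in the diff: `extract_ppt_text` and
the `pptx_parser` argument are illustrative names, and the per-chunk
bookkeeping done in `chunk()` is omitted.

```python
import logging
from io import BytesIO

from tika import parser as tika_parser  # tika-server is already part of the RAGFlow stack


def extract_ppt_text(filename, binary=None, pptx_parser=None):
    """Try the python-pptx based parser first; fall back to Tika for legacy .ppt."""
    try:
        # Primary path: the existing RAGFlowPptParser (python-pptx), which only handles .pptx.
        return list(pptx_parser(binary if binary else filename, 0, 1000000, None))
    except Exception as e:
        logging.warning("python-pptx parsing failed: %s, trying tika as fallback", e)

    # Fallback path: Tika extracts plain text from the legacy OLE2 binary format.
    data = binary if binary else open(filename, "rb").read()
    parsed = tika_parser.from_buffer(BytesIO(data))
    content = parsed.get("content") or ""
    # Tika returns one text blob with no slide boundaries; keep non-empty lines as sections.
    return [line for line in content.split("\n") if line.strip()]
```

The actual change additionally keeps the per-section metadata
(`page_num_int`, `position_int`, tokenization) and reports progress via
`callback`, as shown in the diff below.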
## 🧪 Test / Verification Results
I verified the fix using a legacy `.ppt` file (`math(1).ppt`, ~8MB).
### 1. Before (The Issue)
Parsing this file fails because `python-pptx` cannot read the legacy
OLE2 binary format.
### 2. After (The Fix)
With this PR, the system detects the `python-pptx` failure and
successfully falls back to Tika. The text is extracted correctly.
**Type of change**
- [x] New Feature (non-breaking change which adds functionality)
Signed-off-by: evilhero <2278596667@qq.com>
Co-authored-by: Yingfeng
---
rag/app/presentation.py | 68 +++++++++++++++++++++++++++++++++--------
1 file changed, 55 insertions(+), 13 deletions(-)
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 2746be918..5c8431af1 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -15,6 +15,7 @@
#
import copy
+import logging
import re
from collections import defaultdict
from io import BytesIO
@@ -125,7 +126,7 @@ class PlainPdf(PlainParser):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
"""
- The supported file formats are pdf, pptx.
+ The supported file formats are pdf, ppt, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
"""
@@ -136,17 +137,58 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
- ppt_parser = RAGFlowPptParser()
- for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
- d = copy.deepcopy(doc)
- pn += from_page
- d["doc_type_kwd"] = "image"
- d["page_num_int"] = [pn + 1]
- d["top_int"] = [0]
- d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
- tokenize(d, txt, eng)
- res.append(d)
- return res
+ try:
+ ppt_parser = RAGFlowPptParser()
+ for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
+ d = copy.deepcopy(doc)
+ pn += from_page
+ d["doc_type_kwd"] = "image"
+ d["page_num_int"] = [pn + 1]
+ d["top_int"] = [0]
+ d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
+ tokenize(d, txt, eng)
+ res.append(d)
+ return res
+ except Exception as e:
+ logging.warning(f"python-pptx parsing failed for {filename}: {e}, trying tika as fallback")
+ if callback:
+ callback(0.1, "python-pptx failed, trying tika as fallback")
+
+ try:
+ from tika import parser as tika_parser
+ except Exception as tika_error:
+ error_msg = f"tika not available: {tika_error}. Unsupported .ppt/.pptx parsing."
+ if callback:
+ callback(0.8, error_msg)
+ logging.warning(f"{error_msg} for {filename}.")
+ raise NotImplementedError(error_msg)
+
+ binary_data = binary if binary else open(filename, 'rb').read()
+ doc_parsed = tika_parser.from_buffer(BytesIO(binary_data))
+
+ if doc_parsed.get("content", None) is not None:
+ sections = doc_parsed["content"].split("\n")
+ sections = [s for s in sections if s.strip()]
+
+ for pn, txt in enumerate(sections):
+ d = copy.deepcopy(doc)
+ pn += from_page
+ d["doc_type_kwd"] = "text"
+ d["page_num_int"] = [pn + 1]
+ d["top_int"] = [0]
+ d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
+ tokenize(d, txt, eng)
+ res.append(d)
+
+ if callback:
+ callback(0.8, "Finish parsing with tika.")
+ return res
+ else:
+ error_msg = f"tika.parser got empty content from {filename}."
+ if callback:
+ callback(0.8, error_msg)
+ logging.warning(error_msg)
+ raise NotImplementedError(error_msg)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
@@ -192,7 +234,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
res.append(d)
return res
- raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
+ raise NotImplementedError("file type not supported yet(ppt, pptx, pdf supported)")
if __name__ == "__main__":