From 1a2d69edc49d66c9e30191830fa4dbd5728975e5 Mon Sep 17 00:00:00 2001 From: eviaaaaa <2278596667@qq.com> Date: Mon, 2 Feb 2026 13:40:51 +0800 Subject: [PATCH] feat: Implement legacy .ppt parsing via Tika (alternative to Aspose) (#12932) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## What problem does this PR solve? This PR implements parsing support for legacy PowerPoint files (`.ppt`, 97-2003 format). Currently, parsing these files fails because `python-pptx` **natively lacks support** for the legacy OLE2 binary format. ## **Context:** I was originally using `aspose-slides` for this purpose. However, since `aspose-slides` is **no longer a project dependency**, I implemented a fallback mechanism using the existing `tika-server` to ensure compatibility and stability. ## **Key Changes:** - **Fallback Logic**: Modified `rag/app/presentation.py` to catch `python-pptx` failures and automatically fall back to Tika parsing. - **No New Dependencies**: Utilizes the `tika` service that is already part of the RAGFlow stack. - **Note**: Since Tika focuses on text extraction, this implementation extracts text content but does not generate slide thumbnails. ## 🧪 Test / Verification Results ### 1. Before (The Issue) I have verified the fix using a legacy `.ppt` file (`math(1).ppt`, ~8MB). image ### 2. After (The Fix) With this PR, the system detects the failure in python-pptx and successfully falls back to Tika. The text is extracted correctly. 
image **Type of change** - [x] New Feature (non-breaking change which adds functionality) Signed-off-by: evilhero <2278596667@qq.com> Co-authored-by: Yingfeng --- rag/app/presentation.py | 68 +++++++++++++++++++++++++++++++++-------- 1 file changed, 55 insertions(+), 13 deletions(-) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 2746be918..5c8431af1 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -15,6 +15,7 @@ # import copy +import logging import re from collections import defaultdict from io import BytesIO @@ -125,7 +126,7 @@ class PlainPdf(PlainParser): def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs): """ - The supported file formats are pdf, pptx. + The supported file formats are pdf, ppt, pptx. Every page will be treated as a chunk. And the thumbnail of every page will be stored. PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary. 
""" @@ -136,17 +137,58 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): - ppt_parser = RAGFlowPptParser() - for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): - d = copy.deepcopy(doc) - pn += from_page - d["doc_type_kwd"] = "image" - d["page_num_int"] = [pn + 1] - d["top_int"] = [0] - d["position_int"] = [(pn + 1, 0, 0, 0, 0)] - tokenize(d, txt, eng) - res.append(d) - return res + try: + ppt_parser = RAGFlowPptParser() + for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): + d = copy.deepcopy(doc) + pn += from_page + d["doc_type_kwd"] = "image" + d["page_num_int"] = [pn + 1] + d["top_int"] = [0] + d["position_int"] = [(pn + 1, 0, 0, 0, 0)] + tokenize(d, txt, eng) + res.append(d) + return res + except Exception as e: + logging.warning(f"python-pptx parsing failed for {filename}: {e}, trying tika as fallback") + if callback: + callback(0.1, "python-pptx failed, trying tika as fallback") + + try: + from tika import parser as tika_parser + except Exception as tika_error: + error_msg = f"tika not available: {tika_error}. Unsupported .ppt/.pptx parsing." 
+ if callback: + callback(0.8, error_msg) + logging.warning(f"{error_msg} for {filename}.") + raise NotImplementedError(error_msg) + + binary_data = binary if binary else open(filename, 'rb').read() + doc_parsed = tika_parser.from_buffer(BytesIO(binary_data)) + + if doc_parsed.get("content", None) is not None: + sections = doc_parsed["content"].split("\n") + sections = [s for s in sections if s.strip()] + + for pn, txt in enumerate(sections): + d = copy.deepcopy(doc) + pn += from_page + d["doc_type_kwd"] = "text" + d["page_num_int"] = [pn + 1] + d["top_int"] = [0] + d["position_int"] = [(pn + 1, 0, 0, 0, 0)] + tokenize(d, txt, eng) + res.append(d) + + if callback: + callback(0.8, "Finish parsing with tika.") + return res + else: + error_msg = f"tika.parser got empty content from {filename}." + if callback: + callback(0.8, error_msg) + logging.warning(error_msg) + raise NotImplementedError(error_msg) elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) @@ -192,7 +234,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca res.append(d) return res - raise NotImplementedError("file type not supported yet(pptx, pdf supported)") + raise NotImplementedError("file type not supported yet(ppt, pptx, pdf supported)") if __name__ == "__main__":