mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-03 17:15:08 +08:00
feat: Implement legacy .ppt parsing via Tika (alternative to Aspose) (#12932)
## What problem does this PR solve? This PR implements parsing support for legacy PowerPoint files (`.ppt`, 97-2003 format). Currently, parsing these files fails because `python-pptx` **natively lacks support** for the legacy OLE2 binary format. ## **Context:** I originally used `aspose-slides` for this purpose. However, since `aspose-slides` is **no longer a project dependency**, I implemented a fallback mechanism using the existing `tika-server` to ensure compatibility and stability. ## **Key Changes:** - **Fallback Logic**: Modified `rag/app/presentation.py` to catch `python-pptx` failures and automatically fall back to Tika parsing. - **No New Dependencies**: Utilizes the `tika` service that is already part of the RAGFlow stack. - **Note**: Since Tika focuses on text extraction, this implementation extracts text content but does not generate slide thumbnails. ## 🧪 Test / Verification Results ### 1. Before (The Issue) I have verified the fix using a legacy `.ppt` file (`math(1).ppt`, ~8MB). <img width="963" height="970" alt="image" src="https://github.com/user-attachments/assets/468c4ba8-f90b-4d7b-b969-9c5f5e42c474" /> ### 2. After (The Fix) With this PR, the system detects the failure in python-pptx and successfully falls back to Tika. The text is extracted correctly. <img width="1467" height="1121" alt="image" src="https://github.com/user-attachments/assets/fa0fed3b-b923-4c86-ba2c-24b3ce6ee7a6" /> **Type of change** - [x] New Feature (non-breaking change which adds functionality) Signed-off-by: evilhero <2278596667@qq.com> Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
@ -15,6 +15,7 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
import copy
|
import copy
|
||||||
|
import logging
|
||||||
import re
|
import re
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -125,7 +126,7 @@ class PlainPdf(PlainParser):
|
|||||||
|
|
||||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
|
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
The supported file formats are pdf, pptx.
|
The supported file formats are pdf, ppt, pptx.
|
||||||
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
||||||
PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
|
PPT file will be parsed by using this method automatically, setting-up for every PPT file is not necessary.
|
||||||
"""
|
"""
|
||||||
@ -136,17 +137,58 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||||
res = []
|
res = []
|
||||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||||
ppt_parser = RAGFlowPptParser()
|
try:
|
||||||
for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
ppt_parser = RAGFlowPptParser()
|
||||||
d = copy.deepcopy(doc)
|
for pn, txt in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||||
pn += from_page
|
d = copy.deepcopy(doc)
|
||||||
d["doc_type_kwd"] = "image"
|
pn += from_page
|
||||||
d["page_num_int"] = [pn + 1]
|
d["doc_type_kwd"] = "image"
|
||||||
d["top_int"] = [0]
|
d["page_num_int"] = [pn + 1]
|
||||||
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
|
d["top_int"] = [0]
|
||||||
tokenize(d, txt, eng)
|
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
|
||||||
res.append(d)
|
tokenize(d, txt, eng)
|
||||||
return res
|
res.append(d)
|
||||||
|
return res
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"python-pptx parsing failed for {filename}: {e}, trying tika as fallback")
|
||||||
|
if callback:
|
||||||
|
callback(0.1, "python-pptx failed, trying tika as fallback")
|
||||||
|
|
||||||
|
try:
|
||||||
|
from tika import parser as tika_parser
|
||||||
|
except Exception as tika_error:
|
||||||
|
error_msg = f"tika not available: {tika_error}. Unsupported .ppt/.pptx parsing."
|
||||||
|
if callback:
|
||||||
|
callback(0.8, error_msg)
|
||||||
|
logging.warning(f"{error_msg} for {filename}.")
|
||||||
|
raise NotImplementedError(error_msg)
|
||||||
|
|
||||||
|
binary_data = binary if binary else open(filename, 'rb').read()
|
||||||
|
doc_parsed = tika_parser.from_buffer(BytesIO(binary_data))
|
||||||
|
|
||||||
|
if doc_parsed.get("content", None) is not None:
|
||||||
|
sections = doc_parsed["content"].split("\n")
|
||||||
|
sections = [s for s in sections if s.strip()]
|
||||||
|
|
||||||
|
for pn, txt in enumerate(sections):
|
||||||
|
d = copy.deepcopy(doc)
|
||||||
|
pn += from_page
|
||||||
|
d["doc_type_kwd"] = "text"
|
||||||
|
d["page_num_int"] = [pn + 1]
|
||||||
|
d["top_int"] = [0]
|
||||||
|
d["position_int"] = [(pn + 1, 0, 0, 0, 0)]
|
||||||
|
tokenize(d, txt, eng)
|
||||||
|
res.append(d)
|
||||||
|
|
||||||
|
if callback:
|
||||||
|
callback(0.8, "Finish parsing with tika.")
|
||||||
|
return res
|
||||||
|
else:
|
||||||
|
error_msg = f"tika.parser got empty content from {filename}."
|
||||||
|
if callback:
|
||||||
|
callback(0.8, error_msg)
|
||||||
|
logging.warning(error_msg)
|
||||||
|
raise NotImplementedError(error_msg)
|
||||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
||||||
|
|
||||||
@ -192,7 +234,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
res.append(d)
|
res.append(d)
|
||||||
return res
|
return res
|
||||||
|
|
||||||
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
|
raise NotImplementedError("file type not supported yet(ppt, pptx, pdf supported)")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user