Feat: pipeline supports PPTX (#10167)

### What problem does this PR solve?

The pipeline now supports naive (text-only) parsing of PPTX files.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-09-19 12:14:35 +08:00
committed by GitHub
parent 4fae40f66a
commit 5dfdbcce3a
4 changed files with 53 additions and 6 deletions

View File

@ -51,7 +51,9 @@ class ParserParam(ProcessParamBase):
"word": [ "word": [
"json", "json",
], ],
"ppt": [], "slides": [
"json",
],
"image": [ "image": [
"text" "text"
], ],
@ -95,7 +97,13 @@ class ParserParam(ProcessParamBase):
"suffix": ["md", "markdown", "mdx"], "suffix": ["md", "markdown", "mdx"],
"output_format": "json", "output_format": "json",
}, },
"ppt": {}, "slides": {
"parse_method": "presentation",
"suffix": [
"pptx",
],
"output_format": "json",
},
"image": { "image": {
"parse_method": "ocr", "parse_method": "ocr",
"llm_id": "", "llm_id": "",
@ -160,6 +168,11 @@ class ParserParam(ProcessParamBase):
doc_output_format = doc_config.get("output_format", "") doc_output_format = doc_config.get("output_format", "")
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"]) self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"])
slides_config = self.setups.get("slides", "")
if slides_config:
slides_output_format = slides_config.get("output_format", "")
self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
image_config = self.setups.get("image", "") image_config = self.setups.get("image", "")
if image_config: if image_config:
image_parse_method = image_config.get("parse_method", "") image_parse_method = image_config.get("parse_method", "")
@ -209,7 +222,6 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", bboxes) self.set_output("json", bboxes)
if conf.get("output_format") == "markdown": if conf.get("output_format") == "markdown":
mkdn = "" mkdn = ""
for b in bboxes: for b in bboxes:
@ -253,8 +265,27 @@ class Parser(ProcessBase):
if conf.get("output_format") == "json": if conf.get("output_format") == "json":
self.set_output("json", sections) self.set_output("json", sections)
def _slides(self, name, blob):
    """Parse a PowerPoint (.pptx) document into plain-text sections.

    Extracts text from each slide with RAGFlowPptParser and publishes the
    non-empty sections on the "json" output as ``[{"text": ...}, ...]``.

    Args:
        name: Display name of the document (unused here; kept for the
            common parser-method signature used by the dispatch table).
        blob: Raw bytes of the PPTX file.

    Raises:
        ValueError: if the configured output format is not "json" —
            slides currently support JSON output only.
    """
    from deepdoc.parser.ppt_parser import RAGFlowPptParser

    self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
    conf = self._param.setups["slides"]
    output_format = conf["output_format"]
    self.set_output("output_format", output_format)

    # Only JSON is supported for slides; raise instead of assert so the
    # check survives `python -O` (asserts are stripped under -O).
    if output_format != "json":
        raise ValueError("Slides output format must be 'json'.")

    # Do not shadow the imported class name with the instance.
    parser = RAGFlowPptParser()
    # from_page=0, to_page=100000 effectively parses all slides; no
    # per-page progress callback is needed for the naive text-only path.
    txts = parser(blob, 0, 100000, None)
    sections = [{"text": txt} for txt in txts if txt.strip()]
    self.set_output("json", sections)
def _markdown(self, name, blob): def _markdown(self, name, blob):
from functools import reduce from functools import reduce
from rag.app.naive import Markdown as naive_markdown_parser from rag.app.naive import Markdown as naive_markdown_parser
from rag.nlp import concat_img from rag.nlp import concat_img
@ -322,7 +353,7 @@ class Parser(ProcessBase):
else: else:
# use VLM to describe the picture # use VLM to describe the picture
cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang) cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
img_binary = io.BytesIO() img_binary = io.BytesIO()
img.save(img_binary, format="JPEG") img.save(img_binary, format="JPEG")
img_binary.seek(0) img_binary.seek(0)
@ -358,6 +389,7 @@ class Parser(ProcessBase):
"pdf": self._pdf, "pdf": self._pdf,
"markdown": self._markdown, "markdown": self._markdown,
"spreadsheet": self._spreadsheet, "spreadsheet": self._spreadsheet,
"slides": self._slides,
"word": self._word, "word": self._word,
"text": self._text, "text": self._text,
"image": self._image, "image": self._image,

View File

@ -38,6 +38,13 @@
], ],
"output_format": "json" "output_format": "json"
}, },
"slides": {
"parse_method": "presentation",
"suffix": [
"pptx"
],
"output_format": "json"
},
"markdown": { "markdown": {
"suffix": [ "suffix": [
"md", "md",

View File

@ -114,6 +114,9 @@ class Tokenizer(ProcessBase):
if from_upstream.chunks: if from_upstream.chunks:
chunks = from_upstream.chunks chunks = from_upstream.chunks
for i, ck in enumerate(chunks): for i, ck in enumerate(chunks):
if ck.get("docnm_kwd"): # from presentation method
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
if ck.get("questions"): if ck.get("questions"):
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"])) ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
if ck.get("keywords"): if ck.get("keywords"):
@ -135,12 +138,18 @@ class Tokenizer(ProcessBase):
ck = {"text": payload} ck = {"text": payload}
if "full_text" in self._param.search_method: if "full_text" in self._param.search_method:
if ck.get("docnm_kwd"): # from presentation method
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], "")) ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"]) ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
chunks = [ck] chunks = [ck]
else: else:
chunks = from_upstream.json_result chunks = from_upstream.json_result
for i, ck in enumerate(chunks): for i, ck in enumerate(chunks):
if ck.get("docnm_kwd"): # from presentation method
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"]) ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"]) ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
if i % 100 == 99: if i % 100 == 99:

View File

@ -945,7 +945,6 @@ class GiteeEmbed(SILICONFLOWEmbed):
base_url = "https://ai.gitee.com/v1/embeddings" base_url = "https://ai.gitee.com/v1/embeddings"
super().__init__(key, model_name, base_url) super().__init__(key, model_name, base_url)
class DeepInfraEmbed(OpenAIEmbed): class DeepInfraEmbed(OpenAIEmbed):
_FACTORY_NAME = "DeepInfra" _FACTORY_NAME = "DeepInfra"