Mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Feat: pipeline supports PPTX (#10167)
### What problem does this PR solve?

Pipeline supports parsing PPTX naively (text only).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
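For orientation, the registration this PR adds boils down to a new "slides" entry in the parser setups (field names and values are copied from the hunks below; the Python literal here is only illustrative, not the component's actual defaults object):

```python
# Illustrative summary of the new "slides" setup registered by this PR.
slides_setup = {
    "parse_method": "presentation",
    "suffix": ["pptx"],        # only .pptx is registered; the old empty "ppt" entry is removed
    "output_format": "json",   # "json" is the only allowed output format for slides
}
```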
@@ -51,7 +51,9 @@ class ParserParam(ProcessParamBase):
             "word": [
                 "json",
             ],
-            "ppt": [],
+            "slides": [
+                "json",
+            ],
             "image": [
                 "text"
             ],
@@ -95,7 +97,13 @@ class ParserParam(ProcessParamBase):
                 "suffix": ["md", "markdown", "mdx"],
                 "output_format": "json",
             },
-            "ppt": {},
+            "slides": {
+                "parse_method": "presentation",
+                "suffix": [
+                    "pptx",
+                ],
+                "output_format": "json",
+            },
             "image": {
                 "parse_method": "ocr",
                 "llm_id": "",
@@ -160,6 +168,11 @@ class ParserParam(ProcessParamBase):
         doc_output_format = doc_config.get("output_format", "")
         self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["doc"])
 
+        slides_config = self.setups.get("slides", "")
+        if slides_config:
+            slides_output_format = slides_config.get("output_format", "")
+            self.check_valid_value(slides_output_format, "Slides output format abnormal.", self.allowed_output_format["slides"])
+
         image_config = self.setups.get("image", "")
         if image_config:
             image_parse_method = image_config.get("parse_method", "")
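To make the new constraint concrete, here is a self-contained sketch of the check those added lines perform (a hypothetical stand-in, not ragflow's own check_valid_value helper, whose internals are not shown in this diff): a slides config may only declare "json" as its output format.

```python
# Hypothetical stand-in for the validation added above; ragflow's
# ParserParam.check_valid_value is not reproduced here.
ALLOWED_SLIDES_OUTPUT_FORMATS = ["json"]

def check_slides_config(slides_config: dict) -> None:
    output_format = slides_config.get("output_format", "")
    if output_format not in ALLOWED_SLIDES_OUTPUT_FORMATS:
        raise ValueError("Slides output format abnormal.")

check_slides_config({"output_format": "json"})        # passes silently
# check_slides_config({"output_format": "markdown"})  # would raise ValueError
```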
@@ -209,7 +222,6 @@ class Parser(ProcessBase):
 
         if conf.get("output_format") == "json":
            self.set_output("json", bboxes)
-
        if conf.get("output_format") == "markdown":
            mkdn = ""
            for b in bboxes:
@@ -253,8 +265,27 @@ class Parser(ProcessBase):
         if conf.get("output_format") == "json":
             self.set_output("json", sections)
 
+    def _slides(self, name, blob):
+        from deepdoc.parser.ppt_parser import RAGFlowPptParser as ppt_parser
+
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
+
+        conf = self._param.setups["slides"]
+        self.set_output("output_format", conf["output_format"])
+
+        ppt_parser = ppt_parser()
+        txts = ppt_parser(blob, 0, 100000, None)
+
+        sections = [{"text": section} for section in txts if section.strip()]
+
+        # json
+        assert conf.get("output_format") == "json", "have to be json for ppt"
+        if conf.get("output_format") == "json":
+            self.set_output("json", sections)
+
     def _markdown(self, name, blob):
         from functools import reduce
 
         from rag.app.naive import Markdown as naive_markdown_parser
         from rag.nlp import concat_img
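Outside the pipeline, the core of the new _slides handler can be exercised directly. The sketch below uses only the calls visible in the hunk above; it assumes a ragflow checkout where deepdoc is importable, and "deck.pptx" is a hypothetical input file.

```python
# Minimal sketch of the naive, text-only PPTX path added above.
from deepdoc.parser.ppt_parser import RAGFlowPptParser

with open("deck.pptx", "rb") as f:   # hypothetical example file
    blob = f.read()

parser = RAGFlowPptParser()
# Same invocation as in _slides(): (blob, from_page, to_page, callback)
txts = parser(blob, 0, 100000, None)

# Each non-empty slide text becomes one section in the same {"text": ...}
# shape that later steps in this diff read.
sections = [{"text": t} for t in txts if t.strip()]
print(sections[:2])
```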
@@ -322,7 +353,7 @@ class Parser(ProcessBase):
 
         else:
             # use VLM to describe the picture
-            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"],lang=lang)
+            cv_model = LLMBundle(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, llm_name=conf["llm_id"], lang=lang)
             img_binary = io.BytesIO()
             img.save(img_binary, format="JPEG")
             img_binary.seek(0)
@@ -358,6 +389,7 @@ class Parser(ProcessBase):
             "pdf": self._pdf,
             "markdown": self._markdown,
             "spreadsheet": self._spreadsheet,
+            "slides": self._slides,
             "word": self._word,
             "text": self._text,
             "image": self._image,
@@ -38,6 +38,13 @@
             ],
             "output_format": "json"
         },
+        "slides": {
+            "parse_method": "presentation",
+            "suffix": [
+                "pptx"
+            ],
+            "output_format": "json"
+        },
         "markdown": {
             "suffix": [
                 "md",
@@ -114,6 +114,9 @@ class Tokenizer(ProcessBase):
         if from_upstream.chunks:
             chunks = from_upstream.chunks
             for i, ck in enumerate(chunks):
+                if ck.get("docnm_kwd"): # from presentation method
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                 if ck.get("questions"):
                     ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
                 if ck.get("keywords"):
@@ -135,12 +138,18 @@ class Tokenizer(ProcessBase):
 
             ck = {"text": payload}
             if "full_text" in self._param.search_method:
+                if ck.get("docnm_kwd"): # from presentation method
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                 ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
             chunks = [ck]
         else:
             chunks = from_upstream.json_result
             for i, ck in enumerate(chunks):
+                if ck.get("docnm_kwd"): # from presentation method
+                    ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
+                    ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
                 ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
                 ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
                 if i % 100 == 99:
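The two Tokenizer hunks treat a chunk carrying docnm_kwd as coming from the presentation route and derive title tokens from the document name with its extension stripped. A tiny self-contained illustration of that extension-stripping step (the chunk dict here is hypothetical, and rag_tokenizer itself is not reproduced):

```python
import re

# Hypothetical chunk as emitted by the presentation route; "docnm_kwd" holds the file name.
ck = {"docnm_kwd": "quarterly_review.pptx", "text": "Q3 highlights ..."}

# Same regex as in the added lines: drop a trailing ".pptx"-style extension
# before the name is tokenized into title_tks.
title = re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"])
print(title)  # -> "quarterly_review"
```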
@@ -945,7 +945,6 @@ class GiteeEmbed(SILICONFLOWEmbed):
         base_url = "https://ai.gitee.com/v1/embeddings"
         super().__init__(key, model_name, base_url)
 
 
-
 class DeepInfraEmbed(OpenAIEmbed):
     _FACTORY_NAME = "DeepInfra"