mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: pipeline supports PPTX (#10167)
### What problem does this PR solve? Pipeline supports parsing PPTX naively (text only). ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -114,6 +114,9 @@ class Tokenizer(ProcessBase):
|
||||
if from_upstream.chunks:
|
||||
chunks = from_upstream.chunks
|
||||
for i, ck in enumerate(chunks):
|
||||
if ck.get("docnm_kwd"): # from presentation method
|
||||
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
|
||||
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
|
||||
if ck.get("questions"):
|
||||
ck["question_tks"] = rag_tokenizer.tokenize("\n".join(ck["questions"]))
|
||||
if ck.get("keywords"):
|
||||
@ -135,12 +138,18 @@ class Tokenizer(ProcessBase):
|
||||
|
||||
ck = {"text": payload}
|
||||
if "full_text" in self._param.search_method:
|
||||
if ck.get("docnm_kwd"): # from presentation method
|
||||
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
|
||||
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
|
||||
ck["content_ltks"] = rag_tokenizer.tokenize(kwargs.get(kwargs["output_format"], ""))
|
||||
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
|
||||
chunks = [ck]
|
||||
else:
|
||||
chunks = from_upstream.json_result
|
||||
for i, ck in enumerate(chunks):
|
||||
if ck.get("docnm_kwd"): # from presentation method
|
||||
ck["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", ck["docnm_kwd"]))
|
||||
ck["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(ck["title_tks"])
|
||||
ck["content_ltks"] = rag_tokenizer.tokenize(ck["text"])
|
||||
ck["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(ck["content_ltks"])
|
||||
if i % 100 == 99:
|
||||
|
||||
Reference in New Issue
Block a user