mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-02 16:45:08 +08:00
Feat: support context window for docx (#12455)
### What problem does this PR solve?

Feat: support context window for docx #12303

Done:
- [x] naive.py
- [x] one.py

TODO:
- [ ] book.py
- [ ] manual.py

Fix: incorrect image position
Fix: incorrect chunk type tag

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -22,7 +22,7 @@ from deepdoc.parser.utils import get_text
|
||||
from rag.app import naive
|
||||
from rag.nlp import rag_tokenizer, tokenize
|
||||
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper_naive
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
@ -76,11 +76,26 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
sections, tbls = naive.Docx()(filename, binary)
|
||||
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
|
||||
sections = [s for s, _ in sections if s]
|
||||
for (_, html), _ in tbls:
|
||||
sections.append(html)
|
||||
sections = naive.Docx()(filename, binary)
|
||||
cks = []
|
||||
image_idxs = []
|
||||
|
||||
for text, image, table in sections:
|
||||
if table is not None:
|
||||
text = (text or "") + str(table)
|
||||
ck_type = "table"
|
||||
else:
|
||||
ck_type = "image" if image is not None else "text"
|
||||
|
||||
if ck_type == "image":
|
||||
image_idxs.append(len(cks))
|
||||
|
||||
cks.append({"text": text, "image": image, "ck_type": ck_type})
|
||||
|
||||
vision_figure_parser_docx_wrapper_naive(cks, image_idxs, callback, **kwargs)
|
||||
for ck in cks:
|
||||
print(ck)
|
||||
sections = [ck["text"] for ck in cks if ck.get("text")]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
|
||||
Reference in New Issue
Block a user