mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-02 00:25:06 +08:00
Feat: support context window for docx (#12455)
### What problem does this PR solve? Feat: support context window for docx #12303 Done: - [x] naive.py - [x] one.py TODO: - [ ] book.py - [ ] manual.py Fix: incorrect image position Fix: incorrect chunk type tag ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -87,10 +87,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.1, "Start to parse.")
|
||||
doc_parser = naive.Docx()
|
||||
# TODO: table of contents need to be removed
|
||||
sections, tbls = doc_parser(
|
||||
main_sections = doc_parser(
|
||||
filename, binary=binary, from_page=from_page, to_page=to_page)
|
||||
|
||||
sections = []
|
||||
tbls = []
|
||||
for text, image, html in main_sections:
|
||||
sections.append((text, image))
|
||||
tbls.append(((None, html), ""))
|
||||
|
||||
remove_contents_table(sections, eng=is_english(
|
||||
random_choices([t for t, _ in sections], k=200)))
|
||||
|
||||
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
|
||||
# tbls = [((None, lns), None) for lns in tbls]
|
||||
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if
|
||||
|
||||
Reference in New Issue
Block a user