mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-03 17:15:08 +08:00
Fix: docx parser output consistent (#12965)
### What problem does this PR solve?

Fix: make the docx parser output consistent. With `output_format` set to `json`, the parser previously failed with:

> File "/home/bxy/ragflow/rag/flow/parser/parser.py", line 506, in _word
> sections, tbls = docx_parser(name, binary=blob)
> ^^^^^^^^^^^^^^
> ValueError: too many values to unpack (expected 2)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -896,7 +896,7 @@ Success:
|
|||||||
"vector_similarity_weight": 0.3
|
"vector_similarity_weight": 0.3
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"total": 1
|
"total_datasets": 1
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@ -503,7 +503,13 @@ class Parser(ProcessBase):
|
|||||||
docx_parser = Docx()
|
docx_parser = Docx()
|
||||||
|
|
||||||
if conf.get("output_format") == "json":
|
if conf.get("output_format") == "json":
|
||||||
sections, tbls = docx_parser(name, binary=blob)
|
main_sections = docx_parser(name, binary=blob)
|
||||||
|
sections = []
|
||||||
|
tbls = []
|
||||||
|
for text, image, html in main_sections:
|
||||||
|
sections.append((text, image))
|
||||||
|
tbls.append(((None, html), ""))
|
||||||
|
|
||||||
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
sections = [{"text": section[0], "image": section[1]} for section in sections if section]
|
||||||
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
|
sections.extend([{"text": tb, "image": None, "doc_type_kwd": "table"} for ((_, tb), _) in tbls])
|
||||||
|
|
||||||
|
|||||||
@ -1168,6 +1168,8 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
|||||||
cks, result_images, tk_nums = [], [], []
|
cks, result_images, tk_nums = [], [], []
|
||||||
for text, image in zip(texts, images):
|
for text, image in zip(texts, images):
|
||||||
text_str = text[0] if isinstance(text, tuple) else text
|
text_str = text[0] if isinstance(text, tuple) else text
|
||||||
|
if text_str is None:
|
||||||
|
text_str = ""
|
||||||
text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
|
text_pos = text[1] if isinstance(text, tuple) and len(text) > 1 else ""
|
||||||
split_sec = re.split(r"(%s)" % custom_pattern, text_str)
|
split_sec = re.split(r"(%s)" % custom_pattern, text_str)
|
||||||
for sub_sec in split_sec:
|
for sub_sec in split_sec:
|
||||||
@ -1187,11 +1189,11 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
|||||||
for text, image in zip(texts, images):
|
for text, image in zip(texts, images):
|
||||||
# if text is tuple, unpack it
|
# if text is tuple, unpack it
|
||||||
if isinstance(text, tuple):
|
if isinstance(text, tuple):
|
||||||
text_str = text[0]
|
text_str = text[0] if text[0] is not None else ""
|
||||||
text_pos = text[1] if len(text) > 1 else ""
|
text_pos = text[1] if len(text) > 1 else ""
|
||||||
add_chunk("\n" + text_str, image, text_pos)
|
add_chunk("\n" + text_str, image, text_pos)
|
||||||
else:
|
else:
|
||||||
add_chunk("\n" + text, image)
|
add_chunk("\n" + (text or ""), image)
|
||||||
|
|
||||||
return cks, result_images
|
return cks, result_images
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user