mirror of https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00

Fix: excel default optimization. (#11519)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
@@ -754,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         excel_parser = ExcelParser()
         if parser_config.get("html4excel"):
             sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+            parser_config["chunk_token_num"] = 0
         else:
             sections = [(_, "") for _ in excel_parser(binary) if _]
+            parser_config["chunk_token_num"] = 12800
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
```
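With `html4excel` enabled, each sheet is emitted as self-contained HTML table fragments, so merging is switched off (`chunk_token_num = 0` keeps every fragment as its own chunk); in plain row mode the budget is raised to 12800 tokens so individual rows coalesce into larger chunks. A minimal sketch of how such a token budget drives merging; `merge_by_budget` is a hypothetical stand-in for the `add_chunk` closures in the `naive_merge*` functions below, and the whitespace split approximates the project's `num_tokens_from_string`:

```python
def merge_by_budget(sections: list[str], chunk_token_num: int) -> list[str]:
    cks, tk_nums = [""], [0]
    for sec in sections:
        tnum = len(sec.split())  # stand-in token count
        if tk_nums[-1] > chunk_token_num:
            cks.append(sec)           # budget spent: open a new chunk
            tk_nums.append(tnum)
        else:
            cks[-1] += "\n" + sec     # under budget: keep merging
            tk_nums[-1] += tnum
    return [c for c in cks if c.strip()]

# chunk_token_num=0: every section becomes its own chunk (html4excel);
# chunk_token_num=12800: many spreadsheet rows merge into one chunk.
```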
```diff
@@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。;!?"):
             tk_nums.append(num_tokens_from_string(text))
     return cks
 
-    dels = get_delimiters(delimiter)
-
     for sec, pos in sections:
-        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk("\n"+sec, pos)
-            continue
-        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
-        for sub_sec in split_sec:
-            if re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n"+sub_sec, pos)
+        add_chunk("\n"+sec, pos)
 
     return cks
```
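`naive_merge` no longer re-splits each section on the configured delimiters: sections go to `add_chunk` whole, and chunk granularity is controlled purely by `chunk_token_num` (which the excel hunk above now sets explicitly). For reference, a runnable sketch of what the removed lines did, assuming `get_delimiters` yields an escaped alternation like the one hard-coded here:

```python
import re

# dels as get_delimiters might produce it for the default "\n。;!?"
# (assumption; the real function is only partially visible in this diff).
dels = "\n|。|;|!|?"
sec = "第一句。第二句;第三句!"

# The capturing group makes re.split keep each delimiter as a list item...
split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
# ...which the removed loop then skipped before calling add_chunk.
subs = [s for s in split_sec if s and not re.match(f"^{dels}$", s)]
print(subs)  # ['第一句', '第二句', '第三句']
```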
```diff
@@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
             tk_nums.append(num_tokens_from_string(text_seg))
     return cks, result_images
 
-    dels = get_delimiters(delimiter)
     for text, image in zip(texts, images):
         # if text is tuple, unpack it
         if isinstance(text, tuple):
             text_str = text[0]
             text_pos = text[1] if len(text) > 1 else ""
-            split_sec = re.split(r"(%s)" % dels, text_str)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image, text_pos)
+            add_chunk("\n"+text_str, image, text_pos)
         else:
-            split_sec = re.split(r"(%s)" % dels, text)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image)
+            add_chunk("\n"+text, image)
 
     return cks, result_images
 
 
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
     if p.style.name.startswith('Heading'):
```
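The same simplification applies here, with the tuple-unpacking guard kept intact: each entry in `texts` may be a bare string or a `(text, position)` tuple, and each text now stays paired with exactly one image per `add_chunk` call instead of sharing its image across sub-sections. A runnable illustration of the accepted input shapes (all values made up):

```python
# Each texts entry is a bare string or a (text, position) tuple;
# images is parallel to texts. Values here are hypothetical.
texts = [("cell text", "pos-1"), "plain paragraph"]
images = ["img-A", "img-B"]

for text, image in zip(texts, images):
    if isinstance(text, tuple):
        text_str = text[0]
        text_pos = text[1] if len(text) > 1 else ""
    else:
        text_str, text_pos = text, ""
    print(repr(text_str), repr(text_pos), image)
```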
```diff
@@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"):
             tk_nums.append(num_tokens_from_string(text_seg))
     return cks, images
 
-    dels = get_delimiters(delimiter)
-    pattern = r"(%s)" % dels
-
     for sec, image in sections:
-        split_sec = re.split(pattern, sec)
-        for sub_sec in split_sec:
-            if not sub_sec or re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n" + sub_sec, image, "")
+        add_chunk("\n" + sec, image, "")
 
     return cks, images
```
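After this commit all three merge functions reduce to the same loop shape, shown generically below (a sketch only; the real functions keep their own `add_chunk` closures and return values):

```python
from typing import Any, Callable, Iterable, Tuple

# Generic form of the simplified merge loops (sketch only).
def merge_loop(sections: Iterable[Tuple[str, Any]],
               add_chunk: Callable[..., None]) -> None:
    for sec, extra in sections:
        # One add_chunk call per section; no delimiter re-splitting.
        add_chunk("\n" + sec, extra)
```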
```diff
@@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):
 
     return dels_pattern
 
+
 class Node:
     def __init__(self, level, depth=-1, texts=None):
         self.level = level
```
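`get_delimiters` itself is only partially visible in this diff. Judging from its call sites in the removed code (`re.split(r"(%s)" % dels, ...)` and `re.match(f"^{dels}$", ...)`), it returns an alternation of escaped delimiters; a hedged sketch:

```python
import re

# Plausible reconstruction of get_delimiters' output, inferred from the
# removed call sites; the real implementation (not shown in this diff)
# may also support multi-character delimiters. Treat this as a sketch.
def get_delimiters_sketch(delimiters: str) -> str:
    return "|".join(re.escape(d) for d in delimiters)

dels = get_delimiters_sketch("\n。;!?")
print(re.split(r"(%s)" % dels, "你好。世界!"))  # ['你好', '。', '世界', '!', '']
```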