Fix: excel default optimization. (#11519)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu
2025-11-25 19:54:20 +08:00
committed by GitHub
parent 7c20c964b4
commit 74e0b58d89
2 changed files with 7 additions and 29 deletions

View File

@@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text))
return cks
dels = get_delimiters(delimiter)
for sec, pos in sections:
if num_tokens_from_string(sec) < chunk_token_num:
add_chunk("\n"+sec, pos)
continue
split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, pos)
add_chunk("\n"+sec, pos)
return cks
@@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
tk_nums.append(num_tokens_from_string(text_seg))
return cks, result_images
dels = get_delimiters(delimiter)
for text, image in zip(texts, images):
# if text is tuple, unpack it
if isinstance(text, tuple):
text_str = text[0]
text_pos = text[1] if len(text) > 1 else ""
split_sec = re.split(r"(%s)" % dels, text_str)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, image, text_pos)
add_chunk("\n"+text_str, image, text_pos)
else:
split_sec = re.split(r"(%s)" % dels, text)
for sub_sec in split_sec:
if re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n"+sub_sec, image)
add_chunk("\n"+text, image)
return cks, result_images
def docx_question_level(p, bull=-1):
txt = re.sub(r"\u3000", " ", p.text).strip()
if p.style.name.startswith('Heading'):
@@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。"):
tk_nums.append(num_tokens_from_string(text_seg))
return cks, images
dels = get_delimiters(delimiter)
pattern = r"(%s)" % dels
for sec, image in sections:
split_sec = re.split(pattern, sec)
for sub_sec in split_sec:
if not sub_sec or re.match(f"^{dels}$", sub_sec):
continue
add_chunk("\n" + sub_sec, image, "")
add_chunk("\n" + sec, image, "")
return cks, images
@@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):
return dels_pattern
class Node:
def __init__(self, level, depth=-1, texts=None):
self.level = level