Fix: excel default optimization. (#11519)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-04 03:25:30 +08:00 · 2025-11-25 19:54:20 +08:00
parent 7c20c964b4
commit 74e0b58d89
2 changed files with 7 additions and 29 deletions
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -754,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            excel_parser = ExcelParser()
            if parser_config.get("html4excel"):
                sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+                parser_config["chunk_token_num"] = 0
            else:
                sections = [(_, "") for _ in excel_parser(binary) if _]
-            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
                tk_nums.append(num_tokens_from_string(text))
        return cks

-    dels = get_delimiters(delimiter)
    for sec, pos in sections:
-        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk("\n"+sec, pos)
-            continue
-        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
-        for sub_sec in split_sec:
-            if re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n"+sub_sec, pos)
+        add_chunk("\n"+sec, pos)

    return cks

@@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
                tk_nums.append(num_tokens_from_string(text_seg))
        return cks, result_images

-    dels = get_delimiters(delimiter)
    for text, image in zip(texts, images):
        # if text is tuple, unpack it
        if isinstance(text, tuple):
            text_str = text[0]
            text_pos = text[1] if len(text) > 1 else ""
-            split_sec = re.split(r"(%s)" % dels, text_str)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image, text_pos)
+            add_chunk("\n"+text_str, image, text_pos)
        else:
-            split_sec = re.split(r"(%s)" % dels, text)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image)
+            add_chunk("\n"+text, image)

    return cks, result_images

+
 def docx_question_level(p, bull=-1):
    txt = re.sub(r"\u3000", " ", p.text).strip()
    if p.style.name.startswith('Heading'):
@@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
                tk_nums.append(num_tokens_from_string(text_seg))
        return cks, images

-    dels = get_delimiters(delimiter)
-    pattern = r"(%s)" % dels
-
    for sec, image in sections:
-        split_sec = re.split(pattern, sec)
-        for sub_sec in split_sec:
-            if not sub_sec or re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n" + sub_sec, image, "")
+        add_chunk("\n" + sec, image, "")

    return cks, images

@@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):

    return dels_pattern

+
 class Node:
    def __init__(self, level, depth=-1, texts=None):
        self.level = level