From 74e0b58d89b1ecc1a745a64782726ab1a2aec22f Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Tue, 25 Nov 2025 19:54:20 +0800
Subject: [PATCH] Fix: Excel default optimization. (#11519)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/app/naive.py    |  2 +-
 rag/nlp/__init__.py | 34 ++++++----------------------------
 2 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index 836b3fd9e..0496c7507 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -754,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             excel_parser = ExcelParser()
             if parser_config.get("html4excel"):
                 sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
+                parser_config["chunk_token_num"] = 0
             else:
                 sections = [(_, "") for _ in excel_parser(binary) if _]
-            parser_config["chunk_token_num"] = 12800
 
     elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index add454ade..0624309ee 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
                 tk_nums.append(num_tokens_from_string(text))
         return cks
 
-    dels = get_delimiters(delimiter)
     for sec, pos in sections:
-        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk("\n"+sec, pos)
-            continue
-        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
-        for sub_sec in split_sec:
-            if re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n"+sub_sec, pos)
+        add_chunk("\n"+sec, pos)
 
     return cks
 
@@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
                 tk_nums.append(num_tokens_from_string(text_seg))
         return cks, result_images
 
-    dels = get_delimiters(delimiter)
     for text, image in zip(texts, images):
         # if text is tuple, unpack it
         if isinstance(text, tuple):
             text_str = text[0]
             text_pos = text[1] if len(text) > 1 else ""
-            split_sec = re.split(r"(%s)" % dels, text_str)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image, text_pos)
+            add_chunk("\n"+text_str, image, text_pos)
         else:
-            split_sec = re.split(r"(%s)" % dels, text)
-            for sub_sec in split_sec:
-                if re.match(f"^{dels}$", sub_sec):
-                    continue
-                add_chunk("\n"+sub_sec, image)
+            add_chunk("\n"+text, image)
 
     return cks, result_images
 
+
 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
     if p.style.name.startswith('Heading'):
@@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
                 tk_nums.append(num_tokens_from_string(text_seg))
         return cks, images
 
-    dels = get_delimiters(delimiter)
-    pattern = r"(%s)" % dels
-
     for sec, image in sections:
-        split_sec = re.split(pattern, sec)
-        for sub_sec in split_sec:
-            if not sub_sec or re.match(f"^{dels}$", sub_sec):
-                continue
-            add_chunk("\n" + sub_sec, image, "")
+        add_chunk("\n" + sec, image, "")
 
     return cks, images
 
@@ -844,6 +821,7 @@ def get_delimiters(delimiters: str):
 
     return dels_pattern
 
+
 class Node:
     def __init__(self, level, depth=-1, texts=None):
         self.level = level