From 74e0b58d89b1ecc1a745a64782726ab1a2aec22f Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 25 Nov 2025 19:54:20 +0800 Subject: [PATCH] Fix: excel default optimization. (#11519) ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- rag/app/naive.py | 2 +- rag/nlp/__init__.py | 34 ++++++---------------------------- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/rag/app/naive.py b/rag/app/naive.py index 836b3fd9e..0496c7507 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -754,9 +754,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, excel_parser = ExcelParser() if parser_config.get("html4excel"): sections = [(_, "") for _ in excel_parser.html(binary, 12) if _] + parser_config["chunk_token_num"] = 0 else: sections = [(_, "") for _ in excel_parser(binary) if _] - parser_config["chunk_token_num"] = 12800 elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index add454ade..0624309ee 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -628,16 +628,8 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。; tk_nums.append(num_tokens_from_string(text)) return cks - dels = get_delimiters(delimiter) for sec, pos in sections: - if num_tokens_from_string(sec) < chunk_token_num: - add_chunk("\n"+sec, pos) - continue - split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL) - for sub_sec in split_sec: - if re.match(f"^{dels}$", sub_sec): - continue - add_chunk("\n"+sub_sec, pos) + add_chunk("\n"+sec, pos) return cks @@ -700,26 +692,18 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。 tk_nums.append(num_tokens_from_string(text_seg)) return cks, result_images - dels = get_delimiters(delimiter) for text, image in zip(texts, images): # if text is tuple, unpack it if isinstance(text, tuple): text_str = text[0] text_pos = text[1] if len(text) > 1 else "" - split_sec = re.split(r"(%s)" % dels, text_str) - for sub_sec in split_sec: - if re.match(f"^{dels}$", sub_sec): - continue - add_chunk("\n"+sub_sec, image, text_pos) + add_chunk("\n"+text_str, image, text_pos) else: - split_sec = re.split(r"(%s)" % dels, text) - for sub_sec in split_sec: - if re.match(f"^{dels}$", sub_sec): - continue - add_chunk("\n"+sub_sec, image) + add_chunk("\n"+text, image) return cks, result_images + def docx_question_level(p, bull=-1): txt = re.sub(r"\u3000", " ", p.text).strip() if p.style.name.startswith('Heading'): @@ -808,15 +792,8 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。;!?"): tk_nums.append(num_tokens_from_string(text_seg)) return cks, images - dels = get_delimiters(delimiter) - pattern = r"(%s)" % dels - for sec, image in sections: - split_sec = re.split(pattern, sec) - for sub_sec in split_sec: - if not sub_sec or re.match(f"^{dels}$", sub_sec): - continue - add_chunk("\n" + sub_sec, image, "") + add_chunk("\n" + sec, image, "") return cks, images @@ -844,6 +821,7 @@ def get_delimiters(delimiters: str): return dels_pattern + class Node: def __init__(self, level, depth=-1, texts=None): self.level = level