From 7be3dacdaa8f7a9e8fee0982576cbf0bc1faf007 Mon Sep 17 00:00:00 2001
From: Magicbook1108
Date: Tue, 3 Feb 2026 09:43:18 +0800
Subject: [PATCH] Fix: custom delimiter in docx (#12946)

### What problem does this PR solve?

Fix: custom delimiter in docx. Custom delimiters wrapped in backticks (e.g. `##`) were not handled correctly when chunking DOCX files. `_build_cks` now accumulates text between delimiter matches, flushes a stripped chunk at each match, skips empty segments, and returns a `has_custom` flag so that `_merge_cks` keeps each delimited segment as its own chunk instead of merging segments back together.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/graphrag/entity_resolution_prompt.py |  2 +-
 rag/nlp/__init__.py                      | 93 +++++++++++++++++++-----
 2 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/rag/graphrag/entity_resolution_prompt.py b/rag/graphrag/entity_resolution_prompt.py
index d7a360dd5..76e9ad3ae 100644
--- a/rag/graphrag/entity_resolution_prompt.py
+++ b/rag/graphrag/entity_resolution_prompt.py
@@ -51,7 +51,7 @@ Example 2:
 
 Question:
 When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors.
-Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" No, toponym A and toponym B are same toponym.
+Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" Yes, toponym A and toponym B are same toponym.
 Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown"
 Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou"
 Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking"
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 5ba77ca05..2cd725197 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1242,49 +1242,105 @@ def _build_cks(sections, delimiter):
     tables = []
     images = []
 
+    # extract custom delimiters wrapped by backticks: `##`, `---`, etc.
     custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
     has_custom = bool(custom_delimiters)
 
     if has_custom:
+        # escape delimiters and build alternation pattern, longest first
         custom_pattern = "|".join(
             re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
         )
+        # capture delimiters so they appear in re.split results
         pattern = r"(%s)" % custom_pattern
 
+    seg = ""
     for text, image, table in sections:
-        # normalize text
+        # normalize text: ensure string and prepend newline for continuity
         if not text:
-            text = "\n"
+            text = ""
         else:
             text = "\n" + str(text)
 
         if table:
-            # table ck
+            # table chunk
             ck_text = text + str(table)
             idx = len(cks)
-            cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)})
+            cks.append({
+                "text": ck_text,
+                "image": image,
+                "ck_type": "table",
+                "tk_nums": num_tokens_from_string(ck_text),
+            })
             tables.append(idx)
             continue
 
         if image:
-            # image ck (text can be kept as-is; depends on your downstream)
+            # image chunk (text kept as-is for context)
             idx = len(cks)
-            cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)})
+            cks.append({
+                "text": text,
+                "image": image,
+                "ck_type": "image",
+                "tk_nums": num_tokens_from_string(text),
+            })
             images.append(idx)
             continue
 
-        # pure text ck(s)
+        # pure text chunk(s)
         if has_custom:
             split_sec = re.split(pattern, text)
             for sub_sec in split_sec:
-                if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
+                # ① empty or whitespace-only segment → flush current buffer
+                if not sub_sec or not sub_sec.strip():
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
                     continue
-                seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
-                cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
-        else:
-            cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})
-    return cks, tables, images
+
+                # ② matched custom delimiter (allow surrounding whitespace)
+                if re.fullmatch(custom_pattern, sub_sec.strip()):
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
+                    continue
+
+                # ③ normal text content → accumulate
+                seg += sub_sec
+        else:
+            # no custom delimiter: emit the text as a single chunk
+            if text and text.strip():
+                t = text.strip()
+                cks.append({
+                    "text": t,
+                    "image": None,
+                    "ck_type": "text",
+                    "tk_nums": num_tokens_from_string(t),
+                })
+
+    # final flush after loop (only when custom delimiters are used)
+    if has_custom and seg and seg.strip():
+        s = seg.strip()
+        cks.append({
+            "text": s,
+            "image": None,
+            "ck_type": "text",
+            "tk_nums": num_tokens_from_string(s),
+        })
+
+    return cks, tables, images, has_custom
 
 
 def _add_context(cks, idx, context_size):
@@ -1363,7 +1419,7 @@ def _add_context(cks, idx, context_size):
     cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""
 
 
-def _merge_cks(cks, chunk_token_num):
+def _merge_cks(cks, chunk_token_num, has_custom):
     merged = []
     image_idxs = []
     prev_text_ck = -1
@@ -1377,8 +1433,7 @@ def _merge_cks(cks, chunk_token_num):
             image_idxs.append(len(merged) - 1)
             continue
 
-
-        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
+        if prev_text_ck < 0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num or has_custom:
             merged.append(cks[i])
             prev_text_ck = len(merged) - 1
             continue
@@ -1399,7 +1454,7 @@ def naive_merge_docx(
     if not sections:
         return [], []
 
-    cks, tables, images = _build_cks(sections, delimiter)
+    cks, tables, images, has_custom = _build_cks(sections, delimiter)
 
     if table_context_size > 0:
         for i in tables:
@@ -1408,8 +1463,8 @@ def naive_merge_docx(
 
     if image_context_size > 0:
         for i in images:
             _add_context(cks, i, image_context_size)
-
-    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
+
+    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num, has_custom)
 
     return merged_cks, merged_image_idx
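
### Reviewer note

A minimal standalone sketch of the splitting behavior this patch gives `_build_cks`, for anyone who wants to poke at it outside the repo. The `delimiter` string, the sample text, and the whitespace-based token counter are illustrative assumptions, not part of the patch; in the repository, `num_tokens_from_string` comes from `rag.nlp`, and the real chunk dicts also carry `image` and `ck_type` fields as in the diff.

```python
import re

# Stand-in for rag.nlp's num_tokens_from_string; illustration only.
def num_tokens_from_string(s: str) -> int:
    return len(s.split())

# Hypothetical user-configured delimiter string: custom delimiters are
# wrapped in backticks, alongside ordinary single-character separators.
delimiter = "\n!?;`##``---`"

# Extract the backtick-wrapped delimiters, as _build_cks does.
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
assert custom_delimiters == ["##", "---"]

# Longest-first alternation; the capture group makes re.split keep each
# matched delimiter in its output so it can be recognized and skipped.
custom_pattern = "|".join(
    re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
)
pattern = r"(%s)" % custom_pattern

text = "\nIntro ## Section one --- Section two"
cks, seg = [], ""
for sub_sec in re.split(pattern, text):
    # blank segment or delimiter match: flush the accumulated buffer
    if not sub_sec.strip() or re.fullmatch(custom_pattern, sub_sec.strip()):
        if seg.strip():
            s = seg.strip()
            cks.append({"text": s, "tk_nums": num_tokens_from_string(s)})
        seg = ""
        continue
    seg += sub_sec  # ordinary content: keep accumulating

if seg.strip():  # final flush, mirroring the end of _build_cks
    s = seg.strip()
    cks.append({"text": s, "tk_nums": num_tokens_from_string(s)})

print([c["text"] for c in cks])
# ['Intro', 'Section one', 'Section two']
```

Because `_merge_cks` now receives `has_custom` and appends each text chunk unmerged when it is true, these three chunks survive to the output of `naive_merge_docx` instead of being merged back together up to `chunk_token_num`.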