From 7be3dacdaa8f7a9e8fee0982576cbf0bc1faf007 Mon Sep 17 00:00:00 2001
From: Magicbook1108
Date: Tue, 3 Feb 2026 09:43:18 +0800
Subject: [PATCH] Fix: custom delimiter in docx (#12946)

### What problem does this PR solve?

Fix: custom delimiter in docx. Custom delimiters wrapped in backticks (e.g. `##`) were not handled correctly when chunking DOCX files. `_build_cks` now accumulates text between delimiter matches, flushes a stripped chunk at each match, skips empty segments, and returns a `has_custom` flag so that `_merge_cks` keeps each delimited segment as its own chunk instead of merging segments back together.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 rag/graphrag/entity_resolution_prompt.py |  2 +-
 rag/nlp/__init__.py                      | 93 +++++++++++++++++++-----
 2 files changed, 75 insertions(+), 20 deletions(-)

diff --git a/rag/graphrag/entity_resolution_prompt.py b/rag/graphrag/entity_resolution_prompt.py
index d7a360dd5..76e9ad3ae 100644
--- a/rag/graphrag/entity_resolution_prompt.py
+++ b/rag/graphrag/entity_resolution_prompt.py
@@ -51,7 +51,7 @@ Example 2:
 
 Question:
 When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors.
-Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" No, toponym A and toponym B are same toponym.
+Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" Yes, toponym A and toponym B are same toponym.
 Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown"
 Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou"
 Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking"
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 5ba77ca05..2cd725197 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -1242,49 +1242,105 @@ def _build_cks(sections, delimiter):
     tables = []
     images = []
 
+    # extract custom delimiters wrapped by backticks: `##`, `---`, etc.
     custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
     has_custom = bool(custom_delimiters)
 
     if has_custom:
+        # escape delimiters and build alternation pattern, longest first
         custom_pattern = "|".join(
             re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
         )
+        # capture delimiters so they appear in re.split results
         pattern = r"(%s)" % custom_pattern
 
+    seg = ""
     for text, image, table in sections:
-        # normalize text
+        # normalize text: ensure string and prepend newline for continuity
         if not text:
-            text = "\n"
+            text = ""
         else:
             text = "\n" + str(text)
 
         if table:
-            # table ck
+            # table chunk
             ck_text = text + str(table)
             idx = len(cks)
-            cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)})
+            cks.append({
+                "text": ck_text,
+                "image": image,
+                "ck_type": "table",
+                "tk_nums": num_tokens_from_string(ck_text),
+            })
             tables.append(idx)
             continue
 
         if image:
-            # image ck (text can be kept as-is; depends on your downstream)
+            # image chunk (text kept as-is for context)
             idx = len(cks)
-            cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)})
+            cks.append({
+                "text": text,
+                "image": image,
+                "ck_type": "image",
+                "tk_nums": num_tokens_from_string(text),
+            })
             images.append(idx)
             continue
 
-        # pure text ck(s)
+        # pure text chunk(s)
         if has_custom:
             split_sec = re.split(pattern, text)
             for sub_sec in split_sec:
-                if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
+                # ① empty or whitespace-only segment → flush current buffer
+                if not sub_sec or not sub_sec.strip():
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
                     continue
-                seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
-                cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
-        else:
-            cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})
-    return cks, tables, images
+
+                # ② matched custom delimiter (allow surrounding whitespace)
+                if re.fullmatch(custom_pattern, sub_sec.strip()):
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
+                    continue
+
+                # ③ normal text content → accumulate
+                seg += sub_sec
+        else:
+            # no custom delimiter: emit the text as a single chunk
+            if text and text.strip():
+                t = text.strip()
+                cks.append({
+                    "text": t,
+                    "image": None,
+                    "ck_type": "text",
+                    "tk_nums": num_tokens_from_string(t),
+                })
+
+    # final flush after loop (only when custom delimiters are used)
+    if has_custom and seg and seg.strip():
+        s = seg.strip()
+        cks.append({
+            "text": s,
+            "image": None,
+            "ck_type": "text",
+            "tk_nums": num_tokens_from_string(s),
+        })
+
+    return cks, tables, images, has_custom
 
 
 def _add_context(cks, idx, context_size):
@@ -1363,7 +1419,7 @@ def _add_context(cks, idx, context_size):
     cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""
 
 
-def _merge_cks(cks, chunk_token_num):
+def _merge_cks(cks, chunk_token_num, has_custom):
     merged = []
     image_idxs = []
     prev_text_ck = -1
@@ -1377,8 +1433,7 @@ def _merge_cks(cks, chunk_token_num):
             image_idxs.append(len(merged) - 1)
             continue
 
-
-        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
+        if prev_text_ck < 0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num or has_custom:
             merged.append(cks[i])
             prev_text_ck = len(merged) - 1
             continue
@@ -1399,7 +1454,7 @@ def naive_merge_docx(
     if not sections:
         return [], []
 
-    cks, tables, images = _build_cks(sections, delimiter)
+    cks, tables, images, has_custom = _build_cks(sections, delimiter)
 
     if table_context_size > 0:
         for i in tables:
@@ -1408,8 +1463,8 @@ def naive_merge_docx(
 
     if image_context_size > 0:
         for i in images:
             _add_context(cks, i, image_context_size)
-
-    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
+
+    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num, has_custom)
 
     return merged_cks, merged_image_idx
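
### Reviewer note

A minimal standalone sketch of the splitting behavior this patch gives `_build_cks`, for anyone who wants to poke at it outside the repo. The `delimiter` string, the sample text, and the whitespace-based token counter are illustrative assumptions, not part of the patch; in the repository, `num_tokens_from_string` comes from `rag.nlp`, and the real chunk dicts also carry `image` and `ck_type` fields as in the diff.

```python
import re

# Stand-in for rag.nlp's num_tokens_from_string; illustration only.
def num_tokens_from_string(s: str) -> int:
    return len(s.split())

# Hypothetical user-configured delimiter string: custom delimiters are
# wrapped in backticks, alongside ordinary single-character separators.
delimiter = "\n!?;`##``---`"

# Extract the backtick-wrapped delimiters, as _build_cks does.
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
assert custom_delimiters == ["##", "---"]

# Longest-first alternation; the capture group makes re.split keep each
# matched delimiter in its output so it can be recognized and skipped.
custom_pattern = "|".join(
    re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
)
pattern = r"(%s)" % custom_pattern

text = "\nIntro ## Section one --- Section two"
cks, seg = [], ""
for sub_sec in re.split(pattern, text):
    # blank segment or delimiter match: flush the accumulated buffer
    if not sub_sec.strip() or re.fullmatch(custom_pattern, sub_sec.strip()):
        if seg.strip():
            s = seg.strip()
            cks.append({"text": s, "tk_nums": num_tokens_from_string(s)})
        seg = ""
        continue
    seg += sub_sec  # ordinary content: keep accumulating

if seg.strip():  # final flush, mirroring the end of _build_cks
    s = seg.strip()
    cks.append({"text": s, "tk_nums": num_tokens_from_string(s)})

print([c["text"] for c in cks])
# ['Intro', 'Section one', 'Section two']
```

Because `_merge_cks` now receives `has_custom` and appends each text chunk unmerged when it is true, these three chunks survive to the output of `naive_merge_docx` instead of being merged back together up to `chunk_token_num`.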