Fix: custom delimeter in docx (#12946)

### What problem does this PR solve? Fix: custom delimeter in docx ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-04 01:25:07 +08:00 · 2026-02-03 09:43:18 +08:00
parent 2e5a18602b
commit 7be3dacdaa
2 changed files with 75 additions and 20 deletions
--- a/rag/graphrag/entity_resolution_prompt.py
+++ b/rag/graphrag/entity_resolution_prompt.py
@ -51,7 +51,7 @@ Example 2:
 Question:
 When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors. 

-Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city"  No, toponym A and toponym B are same toponym.
+Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city"  Yes, toponym A and toponym B are same toponym.
 Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown"  
 Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou"  
 Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking"
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -1242,49 +1242,105 @@ def _build_cks(sections, delimiter):
    tables = []
    images = []

+    # extract custom delimiters wrapped by backticks: `##`, `---`, etc.
    custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
    has_custom = bool(custom_delimiters)

    if has_custom:
+        # escape delimiters and build alternation pattern, longest first
        custom_pattern = "|".join(
            re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
        )
+        # capture delimiters so they appear in re.split results
        pattern = r"(%s)" % custom_pattern

+    seg = ""
    for text, image, table in sections:
-        # normalize text
+        # normalize text: ensure string and prepend newline for continuity
        if not text:
-            text = "\n"
+            text = ""
        else:
            text = "\n" + str(text)

        if table:
-            # table ck
+            # table chunk
            ck_text = text + str(table)
            idx = len(cks)
-            cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)})
+            cks.append({
+                "text": ck_text,
+                "image": image,
+                "ck_type": "table",
+                "tk_nums": num_tokens_from_string(ck_text),
+            })
            tables.append(idx)
            continue

        if image:
-            # image ck (text can be kept as-is; depends on your downstream)
+            # image chunk (text kept as-is for context)
            idx = len(cks)
-            cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)})
+            cks.append({
+                "text": text,
+                "image": image,
+                "ck_type": "image",
+                "tk_nums": num_tokens_from_string(text),
+            })
            images.append(idx)
            continue

-        # pure text ck(s)
+        # pure text chunk(s)
        if has_custom:
            split_sec = re.split(pattern, text)
            for sub_sec in split_sec:
-                if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
+                # ① empty or whitespace-only segment → flush current buffer
+                if not sub_sec or not sub_sec.strip():
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                    seg = ""
                    continue
-                seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
-                cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
-        else:
-            cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})

-    return cks, tables, images
+                # ② matched custom delimiter (allow surrounding whitespace)
+                if re.fullmatch(custom_pattern, sub_sec.strip()):
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                    seg = ""
+                    continue
+
+                # ③ normal text content → accumulate
+                seg += sub_sec
+        else:
+            # no custom delimiter: emit the text as a single chunk
+            if text and text.strip():
+                t = text.strip()
+                cks.append({
+                    "text": t,
+                    "image": None,
+                    "ck_type": "text",
+                    "tk_nums": num_tokens_from_string(t),
+                })
+
+    # final flush after loop (only when custom delimiters are used)
+    if has_custom and seg and seg.strip():
+        s = seg.strip()
+        cks.append({
+            "text": s,
+            "image": None,
+            "ck_type": "text",
+            "tk_nums": num_tokens_from_string(s),
+        })
+
+    return cks, tables, images, has_custom


 def _add_context(cks, idx, context_size):
@ -1363,7 +1419,7 @@ def _add_context(cks, idx, context_size):
    cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""


-def _merge_cks(cks, chunk_token_num):
+def _merge_cks(cks, chunk_token_num, has_custom):
    merged = []
    image_idxs = []
    prev_text_ck = -1
@ -1377,8 +1433,7 @@ def _merge_cks(cks, chunk_token_num):
                image_idxs.append(len(merged) - 1)
            continue

-
-        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
+        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num or has_custom:
            merged.append(cks[i])
            prev_text_ck = len(merged) - 1
            continue
@ -1399,7 +1454,7 @@ def naive_merge_docx(
    if not sections:
        return [], []

-    cks, tables, images = _build_cks(sections, delimiter)
+    cks, tables, images, has_custom = _build_cks(sections, delimiter)

    if table_context_size > 0:
        for i in tables:
@ -1408,8 +1463,8 @@ def naive_merge_docx(
    if image_context_size > 0:
        for i in images:
            _add_context(cks, i, image_context_size)
-
-    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
+    
+    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num, has_custom)

    return merged_cks, merged_image_idx