Fix: custom delimiter in docx (#12946)

### What problem does this PR solve?

Fix: custom delimiter in docx

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Magicbook1108
2026-02-03 09:43:18 +08:00
committed by GitHub
parent 2e5a18602b
commit 7be3dacdaa
2 changed files with 75 additions and 20 deletions

View File

@ -51,7 +51,7 @@ Example 2:
Question: Question:
When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors. When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors.
Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" No, toponym A and toponym B are same toponym. Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" Yes, toponym A and toponym B are same toponym.
Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown" Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown"
Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou" Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou"
Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking" Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking"

View File

@ -1242,49 +1242,105 @@ def _build_cks(sections, delimiter):
tables = [] tables = []
images = [] images = []
# extract custom delimiters wrapped by backticks: `##`, `---`, etc.
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)] custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
has_custom = bool(custom_delimiters) has_custom = bool(custom_delimiters)
if has_custom: if has_custom:
# escape delimiters and build alternation pattern, longest first
custom_pattern = "|".join( custom_pattern = "|".join(
re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True) re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
) )
# capture delimiters so they appear in re.split results
pattern = r"(%s)" % custom_pattern pattern = r"(%s)" % custom_pattern
seg = ""
for text, image, table in sections: for text, image, table in sections:
# normalize text # normalize text: ensure string and prepend newline for continuity
if not text: if not text:
text = "\n" text = ""
else: else:
text = "\n" + str(text) text = "\n" + str(text)
if table: if table:
# table ck # table chunk
ck_text = text + str(table) ck_text = text + str(table)
idx = len(cks) idx = len(cks)
cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)}) cks.append({
"text": ck_text,
"image": image,
"ck_type": "table",
"tk_nums": num_tokens_from_string(ck_text),
})
tables.append(idx) tables.append(idx)
continue continue
if image: if image:
# image ck (text can be kept as-is; depends on your downstream) # image chunk (text kept as-is for context)
idx = len(cks) idx = len(cks)
cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)}) cks.append({
"text": text,
"image": image,
"ck_type": "image",
"tk_nums": num_tokens_from_string(text),
})
images.append(idx) images.append(idx)
continue continue
# pure text ck(s) # pure text chunk(s)
if has_custom: if has_custom:
split_sec = re.split(pattern, text) split_sec = re.split(pattern, text)
for sub_sec in split_sec: for sub_sec in split_sec:
if not sub_sec or re.fullmatch(custom_pattern, sub_sec): # ① empty or whitespace-only segment → flush current buffer
if not sub_sec or not sub_sec.strip():
if seg and seg.strip():
s = seg.strip()
cks.append({
"text": s,
"image": None,
"ck_type": "text",
"tk_nums": num_tokens_from_string(s),
})
seg = ""
continue continue
seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
else:
cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})
return cks, tables, images # ② matched custom delimiter (allow surrounding whitespace)
if re.fullmatch(custom_pattern, sub_sec.strip()):
if seg and seg.strip():
s = seg.strip()
cks.append({
"text": s,
"image": None,
"ck_type": "text",
"tk_nums": num_tokens_from_string(s),
})
seg = ""
continue
# ③ normal text content → accumulate
seg += sub_sec
else:
# no custom delimiter: emit the text as a single chunk
if text and text.strip():
t = text.strip()
cks.append({
"text": t,
"image": None,
"ck_type": "text",
"tk_nums": num_tokens_from_string(t),
})
# final flush after loop (only when custom delimiters are used)
if has_custom and seg and seg.strip():
s = seg.strip()
cks.append({
"text": s,
"image": None,
"ck_type": "text",
"tk_nums": num_tokens_from_string(s),
})
return cks, tables, images, has_custom
def _add_context(cks, idx, context_size): def _add_context(cks, idx, context_size):
@ -1363,7 +1419,7 @@ def _add_context(cks, idx, context_size):
cks[idx]["context_below"] = "".join(parts_below) if parts_below else "" cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""
def _merge_cks(cks, chunk_token_num): def _merge_cks(cks, chunk_token_num, has_custom):
merged = [] merged = []
image_idxs = [] image_idxs = []
prev_text_ck = -1 prev_text_ck = -1
@ -1377,8 +1433,7 @@ def _merge_cks(cks, chunk_token_num):
image_idxs.append(len(merged) - 1) image_idxs.append(len(merged) - 1)
continue continue
if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num or has_custom:
if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
merged.append(cks[i]) merged.append(cks[i])
prev_text_ck = len(merged) - 1 prev_text_ck = len(merged) - 1
continue continue
@ -1399,7 +1454,7 @@ def naive_merge_docx(
if not sections: if not sections:
return [], [] return [], []
cks, tables, images = _build_cks(sections, delimiter) cks, tables, images, has_custom = _build_cks(sections, delimiter)
if table_context_size > 0: if table_context_size > 0:
for i in tables: for i in tables:
@ -1409,7 +1464,7 @@ def naive_merge_docx(
for i in images: for i in images:
_add_context(cks, i, image_context_size) _add_context(cks, i, image_context_size)
merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num) merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num, has_custom)
return merged_cks, merged_image_idx return merged_cks, merged_image_idx