Mirror of https://github.com/infiniflow/ragflow.git
Fix: custom delimiter in docx (#12946)
### What problem does this PR solve?

Fix: custom delimiter in docx

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
@@ -51,7 +51,7 @@ Example 2:
 Question:
 When determining whether two toponym are the same, you should only focus on critical properties and overlook noisy factors.
 
-Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" No, toponym A and toponym B are same toponym.
+Demonstration 1: name of toponym A is : "nanjing", name of toponym B is :"nanjing city" Yes, toponym A and toponym B are same toponym.
 Question 1: name of toponym A is : "Chicago", name of toponym B is :"ChiTown"
 Question 2: name of toponym A is : "Shanghai", name of toponym B is :"Zhengzhou"
 Question 3: name of toponym A is : "Beijing", name of toponym B is :"Peking"
```
```diff
@@ -1242,49 +1242,105 @@ def _build_cks(sections, delimiter):
     tables = []
     images = []
 
+    # extract custom delimiters wrapped by backticks: `##`, `---`, etc.
     custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]
     has_custom = bool(custom_delimiters)
 
     if has_custom:
+        # escape delimiters and build alternation pattern, longest first
         custom_pattern = "|".join(
             re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
         )
+        # capture delimiters so they appear in re.split results
        pattern = r"(%s)" % custom_pattern
 
+    seg = ""
     for text, image, table in sections:
-        # normalize text
+        # normalize text: ensure string and prepend newline for continuity
         if not text:
-            text = "\n"
+            text = ""
         else:
             text = "\n" + str(text)
 
         if table:
-            # table ck
+            # table chunk
             ck_text = text + str(table)
             idx = len(cks)
-            cks.append({"text": ck_text, "image": image, "ck_type": "table", "tk_nums": num_tokens_from_string(ck_text)})
+            cks.append({
+                "text": ck_text,
+                "image": image,
+                "ck_type": "table",
+                "tk_nums": num_tokens_from_string(ck_text),
+            })
             tables.append(idx)
             continue
 
         if image:
-            # image ck (text can be kept as-is; depends on your downstream)
+            # image chunk (text kept as-is for context)
             idx = len(cks)
-            cks.append({"text": text, "image": image, "ck_type": "image", "tk_nums": num_tokens_from_string(text)})
+            cks.append({
+                "text": text,
+                "image": image,
+                "ck_type": "image",
+                "tk_nums": num_tokens_from_string(text),
+            })
             images.append(idx)
             continue
 
-        # pure text ck(s)
+        # pure text chunk(s)
         if has_custom:
             split_sec = re.split(pattern, text)
             for sub_sec in split_sec:
-                if not sub_sec or re.fullmatch(custom_pattern, sub_sec):
+                # ① empty or whitespace-only segment → flush current buffer
+                if not sub_sec or not sub_sec.strip():
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
                     continue
-                seg = "\n" + sub_sec if not sub_sec.startswith("\n") else sub_sec
-                cks.append({"text": seg, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(seg)})
+
+                # ② matched custom delimiter (allow surrounding whitespace)
+                if re.fullmatch(custom_pattern, sub_sec.strip()):
+                    if seg and seg.strip():
+                        s = seg.strip()
+                        cks.append({
+                            "text": s,
+                            "image": None,
+                            "ck_type": "text",
+                            "tk_nums": num_tokens_from_string(s),
+                        })
+                        seg = ""
+                    continue
+
+                # ③ normal text content → accumulate
+                seg += sub_sec
         else:
-            cks.append({"text": text, "image": None, "ck_type": "text", "tk_nums": num_tokens_from_string(text)})
+            # no custom delimiter: emit the text as a single chunk
+            if text and text.strip():
+                t = text.strip()
+                cks.append({
+                    "text": t,
+                    "image": None,
+                    "ck_type": "text",
+                    "tk_nums": num_tokens_from_string(t),
+                })
 
-    return cks, tables, images
+    # final flush after loop (only when custom delimiters are used)
+    if has_custom and seg and seg.strip():
+        s = seg.strip()
+        cks.append({
+            "text": s,
+            "image": None,
+            "ck_type": "text",
+            "tk_nums": num_tokens_from_string(s),
+        })
+
+    return cks, tables, images, has_custom
 
 
 def _add_context(cks, idx, context_size):
```
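The key mechanics above: delimiters arrive backtick-wrapped in the `delimiter` string, get escaped and ordered longest-first, and the capturing group makes `re.split` return the delimiters themselves, so the loop can flush the accumulated `seg` buffer whenever one appears. A minimal sketch of that splitting step, runnable outside RAGFlow (the `delimiter` value and sample text below are illustrative, not from the repo):

```python
import re

# Assumed input format: the chunking config wraps each custom delimiter
# in backticks, e.g. "`##` `---`".
delimiter = "`##` `---`"
custom_delimiters = [m.group(1) for m in re.finditer(r"`([^`]+)`", delimiter)]

# Longest first, so e.g. "---" is tried before a shorter prefix like "--".
custom_pattern = "|".join(
    re.escape(t) for t in sorted(set(custom_delimiters), key=len, reverse=True)
)
# Capturing group: re.split keeps the delimiters in its output, which is
# what lets the loop in _build_cks detect them with re.fullmatch.
pattern = r"(%s)" % custom_pattern

text = "\nIntro paragraph##Body paragraph---Closing paragraph"
print(re.split(pattern, text))
# ['\nIntro paragraph', '##', 'Body paragraph', '---', 'Closing paragraph']
```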
```diff
@@ -1363,7 +1419,7 @@ def _add_context(cks, idx, context_size):
     cks[idx]["context_below"] = "".join(parts_below) if parts_below else ""
 
 
-def _merge_cks(cks, chunk_token_num):
+def _merge_cks(cks, chunk_token_num, has_custom):
     merged = []
     image_idxs = []
     prev_text_ck = -1
```
```diff
@@ -1377,8 +1433,7 @@ def _merge_cks(cks, chunk_token_num):
             image_idxs.append(len(merged) - 1)
             continue
 
-        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num:
+        if prev_text_ck<0 or merged[prev_text_ck]["tk_nums"] >= chunk_token_num or has_custom:
             merged.append(cks[i])
             prev_text_ck = len(merged) - 1
             continue
```
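The `has_custom` flag exists so that `_merge_cks` stops re-merging small text chunks when the user supplied explicit delimiters; otherwise the token-budget merge would glue delimiter-separated chunks back together, which is the reported bug. A hedged sketch of just the updated condition (the chunk dicts imitate those built by `_build_cks`; real token counts come from `num_tokens_from_string`):

```python
def starts_new_chunk(prev_text_ck, merged, chunk_token_num, has_custom):
    # Sketch of the updated condition in _merge_cks, not the real function.
    return (
        prev_text_ck < 0                                       # no earlier text chunk to merge into
        or merged[prev_text_ck]["tk_nums"] >= chunk_token_num  # previous chunk is already full
        or has_custom                                          # user delimiters fix the boundaries
    )

merged = [{"text": "short chunk", "ck_type": "text", "tk_nums": 5}]
print(starts_new_chunk(0, merged, 128, has_custom=True))   # True: chunk stays separate
print(starts_new_chunk(0, merged, 128, has_custom=False))  # False: would be merged into previous
```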
```diff
@@ -1399,7 +1454,7 @@ def naive_merge_docx(
     if not sections:
         return [], []
 
-    cks, tables, images = _build_cks(sections, delimiter)
+    cks, tables, images, has_custom = _build_cks(sections, delimiter)
 
     if table_context_size > 0:
         for i in tables:
```
```diff
@@ -1409,7 +1464,7 @@ def naive_merge_docx(
         for i in images:
             _add_context(cks, i, image_context_size)
 
-    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num)
+    merged_cks, merged_image_idx = _merge_cks(cks, chunk_token_num, has_custom)
 
     return merged_cks, merged_image_idx
```
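End to end, `has_custom` is threaded from `_build_cks` through `naive_merge_docx` into `_merge_cks`. A hypothetical call sketch; the import path, argument order, and keyword names are assumptions based only on the identifiers visible in this diff (`sections` are `(text, image, table)` triples, as consumed by `_build_cks`):

```python
# from rag.nlp import naive_merge_docx  # assumed import path; adjust to the real module

sections = [
    ("Chapter 1##First point##Second point", None, None),  # plain text section
    (None, None, "<table>...</table>"),                    # table-only section
]

merged_cks, merged_image_idx = naive_merge_docx(
    sections,
    delimiter="`##`",        # backtick-wrapped custom delimiter
    chunk_token_num=256,
    table_context_size=2,
    image_context_size=2,
)
# With the fix, "First point" and "Second point" come back as separate chunks;
# before, _merge_cks would re-merge them up to chunk_token_num.
```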