Feat: add child-parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-11-28 19:25:32 +08:00 (committed by GitHub)
Commit: 14616cf845 (parent d2915f6984)
10 changed files with 216 additions and 130 deletions


```diff
@@ -264,14 +264,14 @@ def is_chinese(text):
     return False
 
 
-def tokenize(d, t, eng):
-    d["content_with_weight"] = t
-    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
+def tokenize(d, txt, eng):
+    d["content_with_weight"] = txt
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
     d["content_ltks"] = rag_tokenizer.tokenize(t)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 
 
-def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
+def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, ck in enumerate(chunks):
@@ -288,12 +288,21 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
                 pass
         else:
             add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
         tokenize(d, ck, eng)
         res.append(d)
     return res
 
 
-def tokenize_chunks_with_images(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, (ck, image) in enumerate(zip(chunks, images)):
@@ -303,6 +312,13 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
         d = copy.deepcopy(doc)
         d["image"] = image
         add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
         tokenize(d, ck, eng)
         res.append(d)
     return res
```
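
For reference, a minimal sketch of what the new `child_delimiters_pattern` parameter does to a parent chunk. Because the pattern is wrapped in a capturing group, `re.split` keeps the delimiter matches themselves as list elements, and each resulting fragment is tokenized into its own child document. The pattern and sample text below are illustrative, not taken from the PR:

```python
import re

# Illustrative delimiter pattern (not from the PR): split children on newlines.
child_delimiters_pattern = r"\n"

# A hypothetical parent chunk as produced by the regular chunker.
ck = "First child sentence.\nSecond child sentence.\nThird child sentence."

# Same call shape as in tokenize_chunks() above; the capturing group
# makes re.split() return the delimiters as separate list items.
pieces = re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL)
print(pieces)
# ['First child sentence.', '\n', 'Second child sentence.', '\n', 'Third child sentence.']
```

Each element of `pieces` becomes a child document (`dd`) that inherits the parent's metadata and positions, while the untouched parent text is kept in `mom_with_weight` alongside the existing `content_with_weight` field; this suggests retrieval can match against fine-grained children and then surface the full parent chunk. Note that under this scheme the delimiter matches (and any empty strings between adjacent delimiters) are also tokenized as children.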