Fix: parent-children chunking method. (#11997)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-12-17 16:50:36 +08:00
committed by GitHub
parent 7baa67dfe8
commit 8e4d011b15
10 changed files with 160 additions and 57 deletions

View File

@ -273,6 +273,21 @@ def tokenize(d, txt, eng):
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
def split_with_pattern(d, pattern:str, content:str, eng) -> list:
docs = []
txts = [txt for txt in re.split(r"(%s)" % pattern, content, flags=re.DOTALL)]
for j in range(0, len(txts), 2):
txt = txts[j]
if not txt:
continue
if j + 1 < len(txts):
txt += txts[j+1]
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
docs.append(dd)
return docs
def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
res = []
# wrap up as es documents
@ -293,10 +308,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=
if child_delimiters_pattern:
d["mom_with_weight"] = ck
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
res.append(dd)
res.extend(split_with_pattern(d, child_delimiters_pattern, ck, eng))
continue
tokenize(d, ck, eng)
@ -316,10 +328,7 @@ def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_patte
add_positions(d, [[ii]*5])
if child_delimiters_pattern:
d["mom_with_weight"] = ck
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
res.append(dd)
res.extend(split_with_pattern(d, child_delimiters_pattern, ck, eng))
continue
tokenize(d, ck, eng)
res.append(d)