mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-26 00:46:52 +08:00
Feat: add child parent chunking method in backend. (#11598)
### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -264,14 +264,14 @@ def is_chinese(text):
|
||||
return False
|
||||
|
||||
|
||||
def tokenize(d, t, eng):
|
||||
d["content_with_weight"] = t
|
||||
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
|
||||
def tokenize(d, txt, eng):
    """Populate the content and token fields of ``d`` from ``txt``.

    The text is stored unchanged under ``content_with_weight``. Table-related
    HTML tags (table, td, caption, tr, th) are replaced with spaces before the
    text is passed to ``rag_tokenizer.tokenize``; that result is stored under
    ``content_ltks`` and its fine-grained tokenization under
    ``content_sm_ltks``.

    ``eng`` is accepted but is not used inside this function.
    """
    table_tag_pattern = r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>"

    # Keep the original text (markup included) before any stripping.
    d["content_with_weight"] = txt

    # Tokenize the text with the table tags blanked out.
    d["content_ltks"] = rag_tokenizer.tokenize(
        re.sub(table_tag_pattern, " ", txt)
    )
    d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(
        d["content_ltks"]
    )
|
||||
|
||||
|
||||
def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
||||
def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
|
||||
res = []
|
||||
# wrap up as es documents
|
||||
for ii, ck in enumerate(chunks):
|
||||
@ -288,12 +288,21 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
|
||||
pass
|
||||
else:
|
||||
add_positions(d, [[ii]*5])
|
||||
|
||||
if child_delimiters_pattern:
|
||||
d["mom_with_weight"] = ck
|
||||
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
|
||||
dd = copy.deepcopy(d)
|
||||
tokenize(dd, txt, eng)
|
||||
res.append(dd)
|
||||
continue
|
||||
|
||||
tokenize(d, ck, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
|
||||
def tokenize_chunks_with_images(chunks, doc, eng, images):
|
||||
def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
|
||||
res = []
|
||||
# wrap up as es documents
|
||||
for ii, (ck, image) in enumerate(zip(chunks, images)):
|
||||
@ -303,6 +312,13 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
|
||||
d = copy.deepcopy(doc)
|
||||
d["image"] = image
|
||||
add_positions(d, [[ii]*5])
|
||||
if child_delimiters_pattern:
|
||||
d["mom_with_weight"] = ck
|
||||
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
|
||||
dd = copy.deepcopy(d)
|
||||
tokenize(dd, txt, eng)
|
||||
res.append(dd)
|
||||
continue
|
||||
tokenize(d, ck, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
Reference in New Issue
Block a user