Feat: add child-parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-11-28 19:25:32 +08:00 (committed by GitHub)
Commit: 14616cf845 (parent d2915f6984)
10 changed files with 216 additions and 130 deletions


```diff
@@ -264,14 +264,14 @@ def is_chinese(text):
     return False
 
 
-def tokenize(d, t, eng):
-    d["content_with_weight"] = t
-    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
+def tokenize(d, txt, eng):
+    d["content_with_weight"] = txt
+    t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
     d["content_ltks"] = rag_tokenizer.tokenize(t)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
 
 
-def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
+def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, ck in enumerate(chunks):
@@ -288,12 +288,21 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
                 pass
         else:
             add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
         tokenize(d, ck, eng)
         res.append(d)
     return res
 
 
-def tokenize_chunks_with_images(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
     res = []
     # wrap up as es documents
     for ii, (ck, image) in enumerate(zip(chunks, images)):
@@ -303,6 +312,13 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
         d = copy.deepcopy(doc)
         d["image"] = image
         add_positions(d, [[ii]*5])
+        if child_delimiters_pattern:
+            d["mom_with_weight"] = ck
+            for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
+                dd = copy.deepcopy(d)
+                tokenize(dd, txt, eng)
+                res.append(dd)
+            continue
         tokenize(d, ck, eng)
         res.append(d)
     return res
```
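
For reference, a minimal sketch of what the new `child_delimiters_pattern` parameter does to a parent chunk. Because the pattern is wrapped in a capturing group, `re.split` keeps the delimiter matches themselves as list elements, and each resulting fragment is tokenized into its own child document. The pattern and sample text below are illustrative, not taken from the PR:

```python
import re

# Illustrative delimiter pattern (not from the PR): split children on newlines.
child_delimiters_pattern = r"\n"

# A hypothetical parent chunk as produced by the regular chunker.
ck = "First child sentence.\nSecond child sentence.\nThird child sentence."

# Same call shape as in tokenize_chunks() above; the capturing group
# makes re.split() return the delimiters as separate list items.
pieces = re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL)
print(pieces)
# ['First child sentence.', '\n', 'Second child sentence.', '\n', 'Third child sentence.']
```

Each element of `pieces` becomes a child document (`dd`) that inherits the parent's metadata and positions, while the untouched parent text is kept in `mom_with_weight` alongside the existing `content_with_weight` field; this suggests retrieval can match against fine-grained children and then surface the full parent chunk. Note that under this scheme the delimiter matches (and any empty strings between adjacent delimiters) are also tokenized as children.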