Feat: add child-parent chunking method in backend. (#11598)

### What problem does this PR solve?

#7996

This PR adds a parent-child chunking method to the backend: merged parent chunks are split into smaller child chunks by user-configurable delimiters, the child chunks are indexed for retrieval, and each parent's full text is stored alongside as a hidden document that provides context.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-11-28 19:25:32 +08:00 (committed by GitHub)
Parent: d2915f6984
Commit: 14616cf845
10 changed files with 216 additions and 130 deletions
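
At a glance, the change works like this: user-configured child delimiters split every merged parent chunk into smaller child chunks; the children are tokenized and indexed as before, but each one carries its parent's full text, and the parent itself is stored once as a hidden document. A rough sketch of the resulting record shapes, using the field names from the diff (the hash value and texts are illustrative):

```python
# Child chunk: indexed and retrievable, produced by tokenize_chunks()
# when a child_delimiters_pattern is supplied.
child = {
    "content_with_weight": "## Installation\nRun the installer ...",
    "mom_with_weight": "<full text of the parent chunk>",  # parent text kept on the child
    "mom_id": "a1b2c3d4e5f60708",  # xxh64 hex digest of the parent text, set in insert_es()
}

# Parent ("mom") chunk: written once per distinct parent text by insert_es(),
# hidden from retrieval via available_int = 0.
mother = {
    "id": "a1b2c3d4e5f60708",
    "content_with_weight": "<full text of the parent chunk>",
    "available_int": 0,
}
```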

View File

@ -39,6 +39,7 @@ from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
callback = callback
binary = binary
@ -600,8 +601,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method applies the naive way to chunk files.
@ -611,14 +611,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
urls = set()
url_res = []
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
child_deli = re.findall(r"`([^`]+)`", parser_config.get("children_delimiter", ""))
child_deli = sorted(set(child_deli), key=lambda x: -len(x))
child_deli = "|".join(re.escape(t) for t in child_deli if t)
is_markdown = False
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
final_sections = False
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@ -679,12 +683,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
chunks.extend(url_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
logging.info("naive_merge({}): {}".format(filename, timer() - st))
res.extend(embed_res)
res.extend(url_res)
@ -780,7 +779,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return_section_images=True,
)
final_sections = True
is_markdown = True
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@ -857,7 +856,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
if final_sections:
if is_markdown:
merged_chunks = []
merged_images = []
chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
@ -900,13 +899,11 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
chunks = merged_chunks
has_images = merged_images and any(img is not None for img in merged_images)
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
if has_images:
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
else:
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
else:
if section_images:
if all(image is None for image in section_images):
@ -917,21 +914,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
for index, url in enumerate(urls):
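
On the caller's side, the new behavior in rag/app/naive.py hangs off a single `parser_config` key. A hedged sketch of a configuration that would split parent chunks at markdown-style headings (the delimiter values are only an example; the derivation repeats the three `child_deli` lines from the diff, and `chunk()` hands the result to `tokenize_chunks` / `tokenize_chunks_with_images` as `child_delimiters_pattern`):

```python
import re

parser_config = {
    "chunk_token_num": 512,
    "delimiter": "\n!?。;!?",
    "layout_recognize": "DeepDOC",
    "analyze_hyperlink": True,
    # New in this PR: child delimiters, each wrapped in backticks.
    "children_delimiter": "`\n## ``\n### `",
}

# Same derivation as in chunk(): extract the backtick-wrapped tokens,
# prefer longer delimiters, escape them, and join into one alternation.
child_deli = re.findall(r"`([^`]+)`", parser_config.get("children_delimiter", ""))
child_deli = sorted(set(child_deli), key=lambda x: -len(x))
child_deli = "|".join(re.escape(t) for t in child_deli if t)

# Splitting with a capturing group keeps the headings as their own pieces.
print(re.split(r"(%s)" % child_deli, "Intro\n## Setup\nsteps\n### Notes\nmore", flags=re.DOTALL))
```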

View File

@ -13,10 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import re
from copy import deepcopy
from functools import partial
import trio
from common.misc_utils import get_uuid
from rag.utils.base64_image import id2image, image2id
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@ -32,6 +32,7 @@ class SplitterParam(ProcessParamBase):
self.chunk_token_size = 512
self.delimiters = ["\n"]
self.overlapped_percent = 0
self.children_delimiters = []
def check(self):
self.check_empty(self.delimiters, "Delimiters.")
@ -58,6 +59,14 @@ class Splitter(ProcessBase):
deli += f"`{d}`"
else:
deli += d
child_deli = ""
for d in self._param.children_delimiters:
if len(d) > 1:
child_deli += f"`{d}`"
else:
child_deli += d
child_deli = [m.group(1) for m in re.finditer(r"`([^`]+)`", child_deli)]
custom_pattern = "|".join(re.escape(t) for t in sorted(set(child_deli), key=len, reverse=True))
self.set_output("output_format", "chunks")
self.callback(random.randint(1, 5) / 100.0, "Start to split into chunks.")
@ -78,7 +87,23 @@ class Splitter(ProcessBase):
deli,
self._param.overlapped_percent,
)
self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
if custom_pattern:
docs = []
for c in cks:
if not c.strip():
continue
split_sec = re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL)
if split_sec:
for txt in split_sec:
docs.append({
"text": txt,
"mom": c
})
else:
docs.append({"text": c})
self.set_output("chunks", docs)
else:
self.set_output("chunks", [{"text": c.strip()} for c in cks if c.strip()])
self.callback(1, "Done.")
return
@ -100,12 +125,27 @@ class Splitter(ProcessBase):
{
"text": RAGFlowPdfParser.remove_tag(c),
"image": img,
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)],
"positions": [[pos[0][-1]+1, *pos[1:]] for pos in RAGFlowPdfParser.extract_positions(c)]
}
for c, img in zip(chunks, images) if c.strip()
]
async with trio.open_nursery() as nursery:
for d in cks:
nursery.start_soon(image2id, d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())
self.set_output("chunks", cks)
if custom_pattern:
docs = []
for c in cks:
split_sec = re.split(r"(%s)" % custom_pattern, c["text"], flags=re.DOTALL)
if split_sec:
c["mom"] = c["text"]
for txt in split_sec:
cc = deepcopy(c)
cc["text"] = txt
docs.append(cc)
else:
docs.append(c)
self.set_output("chunks", docs)
else:
self.set_output("chunks", cks)
self.callback(1, "Done.")
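
The same idea in the Splitter pipeline component, as a self-contained sketch (the function name `split_children` is mine, not the component's API): every non-empty parent chunk is re-split by the child pattern, and each piece keeps the whole parent under `"mom"` so the indexer can emit the parent document later.

```python
import re
from typing import Any

def split_children(cks: list[str], custom_pattern: str) -> list[dict[str, Any]]:
    # No child delimiters configured: behave like the old path.
    if not custom_pattern:
        return [{"text": c.strip()} for c in cks if c.strip()]
    docs: list[dict[str, Any]] = []
    for c in cks:
        if not c.strip():
            continue
        # The capturing group keeps the delimiters themselves as list items,
        # so no text is lost between the parent and its children.
        for txt in re.split(r"(%s)" % custom_pattern, c, flags=re.DOTALL):
            docs.append({"text": txt, "mom": c})
    return docs

print(split_children(["## A\nalpha ## B\nbeta"], re.escape("##")))
```

Like the diff, the sketch keeps empty and delimiter-only pieces rather than filtering them here.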

View File

@ -264,14 +264,14 @@ def is_chinese(text):
return False
def tokenize(d, t, eng):
d["content_with_weight"] = t
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", t)
def tokenize(d, txt, eng):
d["content_with_weight"] = txt
t = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", txt)
d["content_ltks"] = rag_tokenizer.tokenize(t)
d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
def tokenize_chunks(chunks, doc, eng, pdf_parser=None, child_delimiters_pattern=None):
res = []
# wrap up as es documents
for ii, ck in enumerate(chunks):
@ -288,12 +288,21 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
pass
else:
add_positions(d, [[ii]*5])
if child_delimiters_pattern:
d["mom_with_weight"] = ck
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
res.append(dd)
continue
tokenize(d, ck, eng)
res.append(d)
return res
def tokenize_chunks_with_images(chunks, doc, eng, images):
def tokenize_chunks_with_images(chunks, doc, eng, images, child_delimiters_pattern=None):
res = []
# wrap up as es documents
for ii, (ck, image) in enumerate(zip(chunks, images)):
@ -303,6 +312,13 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
d = copy.deepcopy(doc)
d["image"] = image
add_positions(d, [[ii]*5])
if child_delimiters_pattern:
d["mom_with_weight"] = ck
for txt in re.split(r"(%s)" % child_delimiters_pattern, ck, flags=re.DOTALL):
dd = copy.deepcopy(d)
tokenize(dd, txt, eng)
res.append(dd)
continue
tokenize(d, ck, eng)
res.append(d)
return res
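
In rag/nlp, the expansion happens per ES document rather than per plain string: the parent text is stored on the child under `mom_with_weight`, and every child gets its own deep copy of the per-chunk metadata (positions, image, ...). A minimal sketch, with `expand_parent` as a hypothetical stand-in and `content_with_weight` set directly where the real code calls `tokenize()`:

```python
import copy
import re

def expand_parent(doc: dict, ck: str, pattern: str) -> list[dict]:
    d = copy.deepcopy(doc)
    d["mom_with_weight"] = ck  # the whole parent travels with each child
    children = []
    for txt in re.split(r"(%s)" % pattern, ck, flags=re.DOTALL):
        dd = copy.deepcopy(d)
        dd["content_with_weight"] = txt  # tokenize() would also fill the *_ltks fields
        children.append(dd)
    return children

doc = {"docnm_kwd": "demo.md", "title_tks": "demo"}
for child in expand_parent(doc, "## A\nalpha ## B\nbeta", re.escape("##")):
    print(repr(child["content_with_weight"]))
```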

View File

@ -128,9 +128,6 @@ def signal_handler(sig, frame):
sys.exit(0)
def set_progress(task_id, from_page=0, to_page=-1, prog=None, msg="Processing..."):
try:
if prog is not None and prog < 0:
@ -720,6 +717,34 @@ async def delete_image(kb_id, chunk_id):
async def insert_es(task_id, task_tenant_id, task_dataset_id, chunks, progress_callback):
mothers = []
mother_ids = set([])
for ck in chunks:
mom = ck.get("mom") or ck.get("mom_with_weight") or ""
if not mom:
continue
id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
if id in mother_ids:
continue
mother_ids.add(id)
ck["mom_id"] = id
mom_ck = copy.deepcopy(ck)
mom_ck["id"] = id
mom_ck["content_with_weight"] = mom
mom_ck["available_int"] = 0
flds = list(mom_ck.keys())
for fld in flds:
if fld not in ["id", "content_with_weight", "doc_id", "kb_id", "available_int"]:
del mom_ck[fld]
mothers.append(mom_ck)
for b in range(0, len(mothers), settings.DOC_BULK_SIZE):
await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(mothers[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
task_canceled = has_canceled(task_id)
if task_canceled:
progress_callback(-1, msg="Task has been canceled.")
return False
for b in range(0, len(chunks), settings.DOC_BULK_SIZE):
doc_store_result = await trio.to_thread.run_sync(lambda: settings.docStoreConn.insert(chunks[b:b + settings.DOC_BULK_SIZE], search.index_name(task_tenant_id), task_dataset_id))
task_canceled = has_canceled(task_id)
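
Finally, `insert_es` turns each distinct parent text into one extra, hidden document before the regular bulk insert. A self-contained sketch of that pass, keeping the diff's field whitelist and hashing scheme (`build_mothers` is a hypothetical name; unlike the diff as shown, the sketch assigns `mom_id` to every child, not only to the first child of each parent):

```python
import xxhash  # third-party; the diff uses it to derive the parent id

MOTHER_FIELDS = {"id", "content_with_weight", "doc_id", "kb_id", "available_int"}

def build_mothers(chunks: list[dict]) -> list[dict]:
    mothers, seen = [], set()
    for ck in chunks:
        mom = ck.get("mom") or ck.get("mom_with_weight") or ""
        if not mom:
            continue
        mom_id = xxhash.xxh64(mom.encode("utf-8")).hexdigest()
        ck["mom_id"] = mom_id  # child keeps a stable pointer to its parent
        if mom_id in seen:
            continue  # the parent document is only written once
        seen.add(mom_id)
        mom_ck = {k: v for k, v in ck.items() if k in MOTHER_FIELDS}
        mom_ck["id"] = mom_id
        mom_ck["content_with_weight"] = mom
        mom_ck["available_int"] = 0  # hidden: parents provide context, not search hits
        mothers.append(mom_ck)
    return mothers
```

The diff then inserts `mothers` in `DOC_BULK_SIZE` batches ahead of the children's own bulk insert.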