Feat: debugging toc part. (#10486)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-31 15:45:08 +08:00 · 2025-10-11 18:45:21 +08:00
parent a0d5f81098
commit 7d2f65671f
6 changed files with 32 additions and 24 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -613,13 +613,13 @@ def naive_merge(sections: str | list, chunk_token_num=128, delimiter="\n。；
    dels = get_delimiters(delimiter)
    for sec, pos in sections:
        if num_tokens_from_string(sec) < chunk_token_num:
-            add_chunk(sec, pos)
+            add_chunk("\n"+sec, pos)
            continue
        split_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
        for sub_sec in split_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
-            add_chunk(sub_sec, pos)
+            add_chunk("\n"+sub_sec, pos)

    return cks

@@ -669,13 +669,13 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
            for sub_sec in split_sec:
                if re.match(f"^{dels}$", sub_sec):
                    continue
-                add_chunk(sub_sec, image, text_pos)
+                add_chunk("\n"+sub_sec, image, text_pos)
        else:
            split_sec = re.split(r"(%s)" % dels, text)
            for sub_sec in split_sec:
                if re.match(f"^{dels}$", sub_sec):
                    continue
-                add_chunk(sub_sec, image)
+                add_chunk("\n"+sub_sec, image)

    return cks, result_images

@@ -757,7 +757,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
        for sub_sec in split_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
-            add_chunk(sub_sec, image,"")
+            add_chunk("\n"+sub_sec, image,"")
        line = ""

    if line:
@@ -765,7 +765,7 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
        for sub_sec in split_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
-            add_chunk(sub_sec, image,"")
+            add_chunk("\n"+sub_sec, image,"")

    return cks, images