Feat: Redesign and refactor agent module (#9113)

### What problem does this PR solve?

#9082 #6365

<u>**WARNING: this change is not compatible with older versions of the `Agent` module, which means that an `Agent` created with an older version will no longer work.**</u>

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-01 08:05:07 +08:00 · 2025-07-30 19:41:09 +08:00
parent 07e37560fc
commit d9fe279dde
124 changed files with 7744 additions and 18226 deletions
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@ -518,7 +518,8 @@ def hierarchical_merge(bull, sections, depth):
    return res


-def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
+def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？", overlapped_percent=0):
+    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
    if not sections:
        return []
    if isinstance(sections[0], type("")):
@ -534,8 +535,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
        if tnum < 8:
            pos = ""
        # Ensure that the length of the merged chunk does not exceed chunk_token_num  
-        if cks[-1] == "" or tk_nums[-1] > chunk_token_num:
-
+        if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
+            if cks:
+                overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
+                t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
            if t.find(pos) < 0:
                t += pos
            cks.append(t)
@ -548,7 +551,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):

    dels = get_delimiters(delimiter)
    for sec, pos in sections:
-        splited_sec = re.split(r"(%s)" % dels, sec)
+        if num_tokens_from_string(sec) < chunk_token_num:
+            add_chunk(sec, pos)
+            continue
+        splited_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
        for sub_sec in splited_sec:
            if re.match(f"^{dels}$", sub_sec):
                continue
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -384,7 +384,7 @@ class Dealer:
        zero_vector = [0.0] * dim
        sim_np = np.array(sim)
        if doc_ids:
-            similarity_threshold = 0 
+            similarity_threshold = 0
        filtered_count = (sim_np >= similarity_threshold).sum()    
        ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
        for i in idx:
@ -403,7 +403,7 @@ class Dealer:
                    ranks["doc_aggs"][dnm]["count"] += 1
                    continue
                break
-                
+
            position_int = chunk.get("position_int", [])
            d = {
                "chunk_id": id,