Feat: Redesign and refactor agent module (#9113)

### What problem does this PR solve?

#9082 #6365

<u>**WARNING: This change is not compatible with older versions of the `Agent`
module, which means that an `Agent` from an older version will no longer
work.**</u>

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-07-30 19:41:09 +08:00
committed by GitHub
parent 07e37560fc
commit d9fe279dde
124 changed files with 7744 additions and 18226 deletions

View File

@ -518,7 +518,8 @@ def hierarchical_merge(bull, sections, depth):
return res
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
if not sections:
return []
if isinstance(sections[0], type("")):
@ -534,8 +535,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
if tnum < 8:
pos = ""
# Ensure that the length of the merged chunk does not exceed chunk_token_num
if cks[-1] == "" or tk_nums[-1] > chunk_token_num:
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
if t.find(pos) < 0:
t += pos
cks.append(t)
@ -548,7 +551,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
dels = get_delimiters(delimiter)
for sec, pos in sections:
splited_sec = re.split(r"(%s)" % dels, sec)
if num_tokens_from_string(sec) < chunk_token_num:
add_chunk(sec, pos)
continue
splited_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue

View File

@ -384,7 +384,7 @@ class Dealer:
zero_vector = [0.0] * dim
sim_np = np.array(sim)
if doc_ids:
similarity_threshold = 0
similarity_threshold = 0
filtered_count = (sim_np >= similarity_threshold).sum()
ranks["total"] = int(filtered_count) # Convert from np.int64 to Python int otherwise JSON serializable error
for i in idx:
@ -403,7 +403,7 @@ class Dealer:
ranks["doc_aggs"][dnm]["count"] += 1
continue
break
position_int = chunk.get("position_int", [])
d = {
"chunk_id": id,