Feat: Redesign and refactor agent module (#9113)

### What problem does this PR solve?

#9082 #6365

<u>**WARNING: this change is not compatible with older versions of the
`Agent` module, which means that an `Agent` from an older version will no
longer work.**</u>

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-07-30 19:41:09 +08:00
committed by GitHub
parent 07e37560fc
commit d9fe279dde
124 changed files with 7744 additions and 18226 deletions

View File

@ -518,7 +518,8 @@ def hierarchical_merge(bull, sections, depth):
return res
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?", overlapped_percent=0):
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
if not sections:
return []
if isinstance(sections[0], type("")):
@ -534,8 +535,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
if tnum < 8:
pos = ""
# Ensure that the length of the merged chunk does not exceed chunk_token_num
if cks[-1] == "" or tk_nums[-1] > chunk_token_num:
if cks[-1] == "" or tk_nums[-1] > chunk_token_num * (100 - overlapped_percent)/100.:
if cks:
overlapped = RAGFlowPdfParser.remove_tag(cks[-1])
t = overlapped[int(len(overlapped)*(100-overlapped_percent)/100.):] + t
if t.find(pos) < 0:
t += pos
cks.append(t)
@ -548,7 +551,10 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
dels = get_delimiters(delimiter)
for sec, pos in sections:
splited_sec = re.split(r"(%s)" % dels, sec)
if num_tokens_from_string(sec) < chunk_token_num:
add_chunk(sec, pos)
continue
splited_sec = re.split(r"(%s)" % dels, sec, flags=re.DOTALL)
for sub_sec in splited_sec:
if re.match(f"^{dels}$", sub_sec):
continue