Feat: Redesign and refactor agent module (#9113)

### What problem does this PR solve? #9082 #6365 <u> **WARNING: it's not compatible with the older version of `Agent` module, which means that `Agent` from older versions can not work anymore.**</u> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-02 00:25:06 +08:00 · 2025-07-30 19:41:09 +08:00
parent 07e37560fc
commit d9fe279dde
124 changed files with 7744 additions and 18226 deletions
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@ -45,9 +45,6 @@ class Pdf(PdfParser):
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
-        # for bb in self.boxes:
-        #    for b in bb:
-        #        print(b)
        logging.debug("OCR: {}".format(timer() - start))

        start = timer()
@ -177,6 +174,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    """
        Only pdf is supported.
    """
+    parser_config = kwargs.get(
+        "parser_config", {
+            "chunk_token_num": 512, "delimiter": "\n!?。；！？", "layout_recognize": "DeepDOC"})
    pdf_parser = None
    doc = {
        "docnm_kwd": filename
@ -187,7 +187,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    eng = lang.lower() == "english"  # pdf_parser.is_english
    if re.search(r"\.pdf$", filename, re.IGNORECASE):
        pdf_parser = Pdf()
-        if kwargs.get("layout_recognize", "DeepDOC") == "Plain Text":
+        if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
            pdf_parser = PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)
@ -222,7 +222,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
                sid += 1
            sec_ids.append(sid)
-            # print(lvl, self.boxes[i]["text"], most_level, sid)

        sections = [(txt, sec_ids[i], poss)
                    for i, (txt, _, poss) in enumerate(sections)]