Feat: Support more chunking methods (#11000)

### What problem does this PR solve? Feat: Support more chunking methods #10772 This PR enables multiple chunking methods — including books, laws, naive, one, and presentation — to be used with all existing PDF parsers (DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes). ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 15:45:08 +08:00 · 2025-11-05 13:00:42 +08:00
parent f126875ec6
commit cf9611c96f
6 changed files with 264 additions and 133 deletions
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@ -20,14 +20,11 @@ from io import BytesIO

 from PIL import Image

-from common.constants import LLMType
-from api.db.services.llm_service import LLMBundle
-from deepdoc.parser.pdf_parser import VisionParser
 from rag.nlp import tokenize, is_english
 from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, PptParser, PlainParser
 from PyPDF2 import PdfReader as pdf2_read
-
+from rag.app.naive import plaintext_parser, PARSERS

 class Ppt(PptParser):
    def __call__(self, fnm, from_page, to_page, callback=None):
@ -54,7 +51,6 @@ class Ppt(PptParser):
        self.is_english = is_english(txts)
        return [(txts[i], imgs[i]) for i in range(len(txts))]

-
 class Pdf(PdfParser):
    def __init__(self):
        super().__init__()
@ -84,7 +80,7 @@ class Pdf(PdfParser):
            res.append((lines, self.page_images[i]))
        callback(0.9, "Page {}~{}: Parsing finished".format(
            from_page, min(to_page, self.total_page)))
-        return res
+        return res, []


 class PlainPdf(PlainParser):
@ -95,7 +91,7 @@ class PlainPdf(PlainParser):
        for page in self.pdf.pages[from_page: to_page]:
            page_txt.append(page.extract_text())
        callback(0.9, "Parsing finished")
-        return [(txt, None) for txt in page_txt]
+        return [(txt, None) for txt in page_txt], []


 def chunk(filename, binary=None, from_page=0, to_page=100000,
@ -130,20 +126,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
        return res
    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
-        if layout_recognizer == "DeepDOC":
-            pdf_parser = Pdf()
-            sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback)
-        elif layout_recognizer == "Plain Text":
-            pdf_parser = PlainParser()
-            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                      callback=callback)
-        else:
-            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
-            pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
-            sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
-                                      callback=callback)

+        if isinstance(layout_recognizer, bool):
+            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
+
+        name = layout_recognizer.strip().lower()
+        parser = PARSERS.get(name, plaintext_parser)
+        callback(0.1, "Start to parse.")
+
+        sections, _, _ = parser(
+            filename = filename,
+            binary = binary,
+            from_page = from_page,
+            to_page = to_page,
+            lang = lang,
+            callback = callback,
+            pdf_cls = Pdf,
+            **kwargs
+        )
+
+        if not sections:
+            return []
+
+        if name in ["tcadp", "docling", "mineru"]:
+            parser_config["chunk_token_num"] = 0
+        
        callback(0.8, "Finish parsing.")
+
        for pn, (txt, img) in enumerate(sections):
            d = copy.deepcopy(doc)
            pn += from_page