From cf9611c96f175ce5a46d19db13f70019ab4b81e0 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Wed, 5 Nov 2025 13:00:42 +0800 Subject: [PATCH] Feat: Support more chunking methods (#11000) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Feat: Support more chunking methods #10772 This PR enables multiple chunking methods — including books, laws, naive, one, and presentation — to be used with all existing PDF parsers (DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes). ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/app/book.py | 38 ++++++-- rag/app/laws.py | 38 ++++++-- rag/app/manual.py | 41 ++++++--- rag/app/naive.py | 192 ++++++++++++++++++++++++---------------- rag/app/one.py | 41 ++++++--- rag/app/presentation.py | 47 ++++++---- 6 files changed, 264 insertions(+), 133 deletions(-) diff --git a/rag/app/book.py b/rag/app/book.py index 319e7a2b9..b6615ead9 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -15,18 +15,18 @@ # import logging -from tika import parser import re from io import BytesIO from deepdoc.parser.utils import get_text from rag.app import naive +from rag.app.naive import plaintext_parser, PARSERS from rag.nlp import bullets_category, is_english,remove_contents_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ tokenize_chunks from rag.nlp import rag_tokenizer -from deepdoc.parser import PdfParser, PlainParser, HtmlParser -from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper +from deepdoc.parser import PdfParser, HtmlParser +from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper from PIL import Image @@ -96,13 +96,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() - sections, tbls = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) - tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + parser = PARSERS.get(name, plaintext_parser) + callback(0.1, "Start to parse.") + + sections, tables, _ = parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + pdf_cls = Pdf, + **kwargs + ) + + if not sections and not tables: + return [] + + if name in ["tcadp", "docling", "mineru"]: + parser_config["chunk_token_num"] = 0 + + callback(0.8, "Finish parsing.") elif re.search(r"\.txt$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = get_text(filename, binary) diff --git a/rag/app/laws.py b/rag/app/laws.py index 1269ee9d8..5da63d354 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -15,7 +15,6 @@ # import logging -from tika import parser import re from io import BytesIO from docx import Document @@ -25,8 +24,8 @@ from deepdoc.parser.utils import get_text from rag.nlp import bullets_category, remove_contents_table, \ make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge from rag.nlp import rag_tokenizer, Node -from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser - +from deepdoc.parser import PdfParser, DocxParser, HtmlParser +from rag.app.naive import plaintext_parser, PARSERS @@ -156,13 +155,36 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return tokenize_chunks(chunks, doc, eng, None) elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() - for txt, poss in pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback)[0]: + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + parser = PARSERS.get(name, plaintext_parser) + callback(0.1, "Start to parse.") + + raw_sections, tables, _ = parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + pdf_cls = Pdf, + **kwargs + ) + + if not raw_sections and not tables: + return [] + + if name in ["tcadp", "docling", "mineru"]: + parser_config["chunk_token_num"] = 0 + + for txt, poss in raw_sections: sections.append(txt + poss) + callback(0.8, "Finish parsing.") elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") txt = get_text(filename, binary) diff --git a/rag/app/manual.py b/rag/app/manual.py index cf97bfcb9..b1e66e7b8 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -22,11 +22,11 @@ from common.constants import ParserType from io import BytesIO from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from common.token_utils import num_tokens_from_string -from deepdoc.parser import PdfParser, PlainParser, DocxParser +from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from docx import Document from PIL import Image - +from rag.app.naive import plaintext_parser, PARSERS class Pdf(PdfParser): def __init__(self): @@ -196,15 +196,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, # is it English eng = lang.lower() == "english" # pdf_parser.is_english if re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() - sections, tbls = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) - if sections and len(sections[0]) < 3: - sections = [(t, lvl, [[0] * 5]) for t, lvl in sections] - # set pivot using the most frequent type of title, - # then merge between 2 pivot + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + pdf_parser = PARSERS.get(name, plaintext_parser) + callback(0.1, "Start to parse.") + + sections, tbls, pdf_parser = pdf_parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + pdf_cls = Pdf, + **kwargs + ) + + if not sections and not tbls: + return [] + + if name in ["tcadp", "docling", "mineru"]: + parser_config["chunk_token_num"] = 0 + + callback(0.8, "Finish parsing.") + if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) most_level = max(0, max_lvl - 1) diff --git a/rag/app/naive.py b/rag/app/naive.py index 12af18c0f..f2bfd565b 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -26,7 +26,6 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship from docx.opc.oxml import parse_xml from markdown import markdown from PIL import Image -from tika import parser from common.constants import LLMType from api.db.services.llm_service import LLMBundle @@ -39,6 +38,100 @@ from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table +def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls = None ,**kwargs): + callback = callback + binary = binary + pdf_parser = pdf_cls() if pdf_cls else Pdf() + sections, tables = pdf_parser( + filename if not binary else binary, + from_page=from_page, + to_page=to_page, + callback=callback + ) + tables = vision_figure_parser_pdf_wrapper(tbls=tables, + callback=callback, + **kwargs) + return sections, tables, pdf_parser + + +def MinerU_parser(filename, binary=None, callback=None, **kwargs): + mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") + mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") + pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) + + if not pdf_parser.check_installation(): + callback(-1, "MinerU not found.") + return None, None + + sections, tables = pdf_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), + backend=os.environ.get("MINERU_BACKEND", "pipeline"), + delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + ) + return sections, tables, pdf_parser + + +def Docling_parser(filename, binary=None, callback=None, **kwargs): + pdf_parser = DoclingParser() + + if not pdf_parser.check_installation(): + callback(-1, "Docling not found.") + return None, None + + sections, tables = pdf_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), + delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + ) + return sections, tables, pdf_parser + + +def TCADP_parser(filename, binary=None, callback=None, **kwargs): + tcadp_parser = TCADPParser() + + if not tcadp_parser.check_installation(): + callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") + return None, None + + sections, tables = tcadp_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), + file_type="PDF" + ) + return sections, tables, tcadp_parser + + +def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): + if kwargs.get("layout_recognizer", "") == "Plain Text": + pdf_parser = PlainParser() + else: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese")) + pdf_parser = VisionParser(vision_model=vision_model, **kwargs) + + sections, tables = pdf_parser( + filename if not binary else binary, + from_page=from_page, + to_page=to_page, + callback=callback + ) + return sections, tables, pdf_parser + + +PARSERS = { + "deepdoc": DeepDOC_parser, + "mineru": MinerU_parser, + "docling": Docling_parser, + "tcadp": TCADP_parser, + "plaintext": plaintext_parser, # default +} + class Docx(DocxParser): def __init__(self): @@ -365,7 +458,7 @@ class Markdown(MarkdownParser): html_content = markdown(text) soup = BeautifulSoup(html_content, 'html.parser') return soup - + def get_picture_urls(self, soup): if soup: return [img.get('src') for img in soup.find_all('img') if img.get('src')] @@ -375,7 +468,7 @@ class Markdown(MarkdownParser): if soup: return set([a.get('href') for a in soup.find_all('a') if a.get('href')]) return [] - + def get_pictures(self, text): """Download and open all images from markdown text.""" import requests @@ -535,82 +628,29 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + parser = PARSERS.get(name, plaintext_parser) callback(0.1, "Start to parse.") - if layout_recognizer == "DeepDOC": - pdf_parser = Pdf() - sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) - tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs) + sections, tables, _ = parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + **kwargs + ) - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") + if not sections and not tables: + return [] - elif layout_recognizer == "MinerU": - mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") - mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") - mineru_server_url = os.environ.get("MINERU_SERVER_URL", "") - mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline") - pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url) - ok, reason = pdf_parser.check_installation(backend=mineru_backend) - if not ok: - callback(-1, f"MinerU not found or server not accessible: {reason}") - return res - - sections, tables = pdf_parser.parse_pdf( - filepath=filename, - binary=binary, - callback=callback, - output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), - backend=mineru_backend, - server_url=mineru_server_url, - delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), - ) + if name in ["tcadp", "docling", "mineru"]: parser_config["chunk_token_num"] = 0 - callback(0.8, "Finish parsing.") - - elif layout_recognizer == "Docling": - pdf_parser = DoclingParser() - if not pdf_parser.check_installation(): - callback(-1, "Docling not found.") - return res - - sections, tables = pdf_parser.parse_pdf( - filepath=filename, - binary=binary, - callback=callback, - output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), - delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), - ) - parser_config["chunk_token_num"] = 0 - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") - - elif layout_recognizer == "TCADP Parser": - tcadp_parser = TCADPParser() - if not tcadp_parser.check_installation(): - callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") - return res - - sections, tables = tcadp_parser.parse_pdf( - filepath=filename, - binary=binary, - callback=callback, - output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), - file_type="PDF" - ) - parser_config["chunk_token_num"] = 0 - callback(0.8, "Finish parsing.") - else: - if layout_recognizer == "Plain Text": - pdf_parser = PlainParser() - else: - vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang) - pdf_parser = VisionParser(vision_model=vision_model, **kwargs) - - sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, - callback=callback) - res = tokenize_table(tables, doc, is_english) - callback(0.8, "Finish parsing.") + + res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") @@ -735,9 +775,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, logging.info(f"Failed to chunk url in registered file type {url}: {e}") sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) - + logging.info("naive_merge({}): {}".format(filename, timer() - st)) - + if embed_res: res.extend(embed_res) if url_res: diff --git a/rag/app/one.py b/rag/app/one.py index bb86b80fc..e4016118d 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -15,16 +15,15 @@ # import logging -from tika import parser from io import BytesIO import re from deepdoc.parser.utils import get_text from rag.app import naive from rag.nlp import rag_tokenizer, tokenize -from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser -from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper - +from deepdoc.parser import PdfParser, ExcelParser, HtmlParser +from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper +from rag.app.naive import plaintext_parser, PARSERS class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, @@ -83,12 +82,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - pdf_parser = Pdf() - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() - sections, tbls = pdf_parser( - filename if not binary else binary, to_page=to_page, callback=callback) - tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + parser = PARSERS.get(name, plaintext_parser) + callback(0.1, "Start to parse.") + + sections, tbls, _ = parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + pdf_cls = Pdf, + **kwargs + ) + + if not sections and not tbls: + return [] + + if name in ["tcadp", "docling", "mineru"]: + parser_config["chunk_token_num"] = 0 + + callback(0.8, "Finish parsing.") + for (img, rows), poss in tbls: if not rows: continue diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 8f168e1c4..497a7b1cb 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -20,14 +20,11 @@ from io import BytesIO from PIL import Image -from common.constants import LLMType -from api.db.services.llm_service import LLMBundle -from deepdoc.parser.pdf_parser import VisionParser from rag.nlp import tokenize, is_english from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, PptParser, PlainParser from PyPDF2 import PdfReader as pdf2_read - +from rag.app.naive import plaintext_parser, PARSERS class Ppt(PptParser): def __call__(self, fnm, from_page, to_page, callback=None): @@ -54,7 +51,6 @@ class Ppt(PptParser): self.is_english = is_english(txts) return [(txts[i], imgs[i]) for i in range(len(txts))] - class Pdf(PdfParser): def __init__(self): super().__init__() @@ -84,7 +80,7 @@ class Pdf(PdfParser): res.append((lines, self.page_images[i])) callback(0.9, "Page {}~{}: Parsing finished".format( from_page, min(to_page, self.total_page))) - return res + return res, [] class PlainPdf(PlainParser): @@ -95,7 +91,7 @@ class PlainPdf(PlainParser): for page in self.pdf.pages[from_page: to_page]: page_txt.append(page.extract_text()) callback(0.9, "Parsing finished") - return [(txt, None) for txt in page_txt] + return [(txt, None) for txt in page_txt], [] def chunk(filename, binary=None, from_page=0, to_page=100000, @@ -130,20 +126,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") - if layout_recognizer == "DeepDOC": - pdf_parser = Pdf() - sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback) - elif layout_recognizer == "Plain Text": - pdf_parser = PlainParser() - sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, - callback=callback) - else: - vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang) - pdf_parser = VisionParser(vision_model=vision_model, **kwargs) - sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, - callback=callback) + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + parser = PARSERS.get(name, plaintext_parser) + callback(0.1, "Start to parse.") + + sections, _, _ = parser( + filename = filename, + binary = binary, + from_page = from_page, + to_page = to_page, + lang = lang, + callback = callback, + pdf_cls = Pdf, + **kwargs + ) + + if not sections: + return [] + + if name in ["tcadp", "docling", "mineru"]: + parser_config["chunk_token_num"] = 0 + callback(0.8, "Finish parsing.") + for pn, (txt, img) in enumerate(sections): d = copy.deepcopy(doc) pn += from_page