From d3d2ccc76c1078dd401707eb6b3bf6db51200d1d Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Thu, 20 Nov 2025 19:07:17 +0800 Subject: [PATCH] Feat: add more chunking method (#11413) ### What problem does this PR solve? Feat: add more chunking method #11311 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/docling_parser.py | 14 +++++++---- deepdoc/parser/mineru_parser.py | 12 +++++++--- docker/.env | 9 ++++++- rag/app/manual.py | 5 ++-- rag/app/naive.py | 4 ++++ rag/app/paper.py | 41 +++++++++++++++++++++++++------- 6 files changed, 66 insertions(+), 19 deletions(-) diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py index 59fec9250..965f82265 100644 --- a/deepdoc/parser/docling_parser.py +++ b/deepdoc/parser/docling_parser.py @@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser): bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3]) yield (DoclingContentType.EQUATION.value, text, bbox) - def _transfer_to_sections(self, doc) -> list[tuple[str, str]]: + def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]: sections: list[tuple[str, str]] = [] for typ, payload, bbox in self._iter_doc_items(doc): if typ == DoclingContentType.TEXT.value: @@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser): continue tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else "" - sections.append((section, tag)) + if parse_method == "manual": + sections.append((section, typ, tag)) + elif parse_method == "paper": + sections.append((section + tag, typ)) + else: + sections.append((section, tag)) return sections def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1): @@ -282,7 +287,8 @@ class DoclingParser(RAGFlowPdfParser): output_dir: Optional[str] = None, lang: Optional[str] = None, method: str = "auto", - delete_output: bool = True, + delete_output: bool = True, + parse_method: str = "raw" ): if not self.check_installation(): @@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser): if callback: callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages") - sections = self._transfer_to_sections(doc) + sections = self._transfer_to_sections(doc, parse_method=parse_method) tables = self._transfer_to_tables(doc) if callback: diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index d2b694188..d4834de39 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser): item[key] = str((subdir / item[key]).resolve()) return data - def _transfer_to_sections(self, outputs: list[dict[str, Any]]): + def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None): sections = [] for output in outputs: match output["type"]: @@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser): case MinerUContentType.DISCARDED: pass - if section: + if section and parse_method == "manual": + sections.append((section, output["type"], self._line_tag(output))) + elif section and parse_method == "paper": + sections.append((section + self._line_tag(output), output["type"])) + else: sections.append((section, self._line_tag(output))) return sections @@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser): method: str = "auto", server_url: Optional[str] = None, delete_output: bool = True, + parse_method: str = "raw" ) -> tuple: import shutil @@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser): self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") - return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs) + + return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs) finally: if temp_pdf and temp_pdf.exists(): try: diff --git a/docker/.env b/docker/.env index d7e4b025f..6423b7824 100644 --- a/docker/.env +++ b/docker/.env @@ -230,9 +230,16 @@ REGISTER_ENABLED=1 # SANDBOX_MAX_MEMORY=256m # b, k, m, g # SANDBOX_TIMEOUT=10s # s, m, 1m30s -# Enable DocLing and Mineru +# Enable DocLing USE_DOCLING=false + +# Enable Mineru USE_MINERU=false +MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru" +MINERU_DELETE_OUTPUT=0 # keep output directory +MINERU_BACKEND=pipeline # or another backend you prefer + + # pptx support DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 \ No newline at end of file diff --git a/rag/app/manual.py b/rag/app/manual.py index 5808e2498..124864041 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang = lang, callback = callback, pdf_cls = Pdf, + parse_method = "manual", **kwargs ) @@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif len(section) != 3: raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") - txt, sec_id, poss = section + txt, layoutno, poss = section if isinstance(poss, str): poss = pdf_parser.extract_positions(poss) first = poss[0] # tuple: ([pn], x1, x2, y1, y2) @@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pn = pn[0] # [pn] -> pn poss[0] = (pn, *first[1:]) - return (txt, sec_id, poss) + return (txt, layoutno, poss) sections = [_normalize_section(sec) for sec in sections] diff --git a/rag/app/naive.py b/rag/app/naive.py index 49dca17af..562336d7f 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) + parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "MinerU not found.") @@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese" backend=os.environ.get("MINERU_BACKEND", "pipeline"), server_url=os.environ.get("MINERU_SERVER_URL", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + parse_method=parse_method ) return sections, tables, pdf_parser def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): pdf_parser = DoclingParser() + parse_method = kwargs.get("parse_method", "raw") if not pdf_parser.check_installation(): callback(-1, "Docling not found.") @@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + parse_method=parse_method ) return sections, tables, pdf_parser diff --git a/rag/app/paper.py b/rag/app/paper.py index d95976c9f..222be0762 100644 --- a/rag/app/paper.py +++ b/rag/app/paper.py @@ -21,8 +21,10 @@ import re from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from common.constants import ParserType from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks -from deepdoc.parser import PdfParser, PlainParser +from deepdoc.parser import PdfParser import numpy as np +from rag.app.naive import by_plaintext, PARSERS + class Pdf(PdfParser): def __init__(self): @@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, "parser_config", { "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) if re.search(r"\.pdf$", filename, re.IGNORECASE): - if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": - pdf_parser = PlainParser() + layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + + if isinstance(layout_recognizer, bool): + layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" + + name = layout_recognizer.strip().lower() + pdf_parser = PARSERS.get(name, by_plaintext) + callback(0.1, "Start to parse.") + + if name == "deepdoc": + pdf_parser = Pdf() + paper = pdf_parser(filename if not binary else binary, + from_page=from_page, to_page=to_page, callback=callback) + else: + sections, tables, pdf_parser = pdf_parser( + filename=filename, + binary=binary, + from_page=from_page, + to_page=to_page, + lang=lang, + callback=callback, + pdf_cls=Pdf, + parse_method="paper", + **kwargs + ) + paper = { "title": filename, "authors": " ", "abstract": "", - "sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0], - "tables": [] + "sections": sections, + "tables": tables } - else: - pdf_parser = Pdf() - paper = pdf_parser(filename if not binary else binary, - from_page=from_page, to_page=to_page, callback=callback) + tbls=paper["tables"] tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) paper["tables"] = tbls