Feat: add more chunking methods (#11413)

### What problem does this PR solve?

Feat: add more chunking methods #11311

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-20 19:07:17 +08:00
committed by GitHub
parent c8ab9079b3
commit d3d2ccc76c
6 changed files with 66 additions and 19 deletions

View File

@ -187,7 +187,7 @@ class DoclingParser(RAGFlowPdfParser):
bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3]) bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
yield (DoclingContentType.EQUATION.value, text, bbox) yield (DoclingContentType.EQUATION.value, text, bbox)
def _transfer_to_sections(self, doc) -> list[tuple[str, str]]: def _transfer_to_sections(self, doc, parse_method: str) -> list[tuple[str, str]]:
sections: list[tuple[str, str]] = [] sections: list[tuple[str, str]] = []
for typ, payload, bbox in self._iter_doc_items(doc): for typ, payload, bbox in self._iter_doc_items(doc):
if typ == DoclingContentType.TEXT.value: if typ == DoclingContentType.TEXT.value:
@ -200,7 +200,12 @@ class DoclingParser(RAGFlowPdfParser):
continue continue
tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else "" tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
sections.append((section, tag)) if parse_method == "manual":
sections.append((section, typ, tag))
elif parse_method == "paper":
sections.append((section + tag, typ))
else:
sections.append((section, tag))
return sections return sections
def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1): def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
@ -283,6 +288,7 @@ class DoclingParser(RAGFlowPdfParser):
lang: Optional[str] = None, lang: Optional[str] = None,
method: str = "auto", method: str = "auto",
delete_output: bool = True, delete_output: bool = True,
parse_method: str = "raw"
): ):
if not self.check_installation(): if not self.check_installation():
@ -318,7 +324,7 @@ class DoclingParser(RAGFlowPdfParser):
if callback: if callback:
callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages") callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
sections = self._transfer_to_sections(doc) sections = self._transfer_to_sections(doc, parse_method=parse_method)
tables = self._transfer_to_tables(doc) tables = self._transfer_to_tables(doc)
if callback: if callback:

View File

@ -476,7 +476,7 @@ class MinerUParser(RAGFlowPdfParser):
item[key] = str((subdir / item[key]).resolve()) item[key] = str((subdir / item[key]).resolve())
return data return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]]): def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = [] sections = []
for output in outputs: for output in outputs:
match output["type"]: match output["type"]:
@ -497,7 +497,11 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.DISCARDED: case MinerUContentType.DISCARDED:
pass pass
if section: if section and parse_method == "manual":
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"]))
else:
sections.append((section, self._line_tag(output))) sections.append((section, self._line_tag(output)))
return sections return sections
@ -516,6 +520,7 @@ class MinerUParser(RAGFlowPdfParser):
method: str = "auto", method: str = "auto",
server_url: Optional[str] = None, server_url: Optional[str] = None,
delete_output: bool = True, delete_output: bool = True,
parse_method: str = "raw"
) -> tuple: ) -> tuple:
import shutil import shutil
@ -565,7 +570,8 @@ class MinerUParser(RAGFlowPdfParser):
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
finally: finally:
if temp_pdf and temp_pdf.exists(): if temp_pdf and temp_pdf.exists():
try: try:

View File

@ -230,9 +230,16 @@ REGISTER_ENABLED=1
# SANDBOX_MAX_MEMORY=256m # b, k, m, g # SANDBOX_MAX_MEMORY=256m # b, k, m, g
# SANDBOX_TIMEOUT=10s # s, m, 1m30s # SANDBOX_TIMEOUT=10s # s, m, 1m30s
# Enable DocLing and Mineru # Enable DocLing
USE_DOCLING=false USE_DOCLING=false
# Enable Mineru
USE_MINERU=false USE_MINERU=false
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
MINERU_DELETE_OUTPUT=0 # keep output directory
MINERU_BACKEND=pipeline # or another backend you prefer
# pptx support # pptx support
DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1

View File

@ -213,6 +213,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang = lang, lang = lang,
callback = callback, callback = callback,
pdf_cls = Pdf, pdf_cls = Pdf,
parse_method = "manual",
**kwargs **kwargs
) )
@ -225,7 +226,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif len(section) != 3: elif len(section) != 3:
raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})") raise ValueError(f"Unexpected section length: {len(section)} (value={section!r})")
txt, sec_id, poss = section txt, layoutno, poss = section
if isinstance(poss, str): if isinstance(poss, str):
poss = pdf_parser.extract_positions(poss) poss = pdf_parser.extract_positions(poss)
first = poss[0] # tuple: ([pn], x1, x2, y1, y2) first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
@ -235,7 +236,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pn = pn[0] # [pn] -> pn pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:]) poss[0] = (pn, *first[1:])
return (txt, sec_id, poss) return (txt, layoutno, poss)
sections = [_normalize_section(sec) for sec in sections] sections = [_normalize_section(sec) for sec in sections]

View File

@ -59,6 +59,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
parse_method = kwargs.get("parse_method", "raw")
if not pdf_parser.check_installation(): if not pdf_parser.check_installation():
callback(-1, "MinerU not found.") callback(-1, "MinerU not found.")
@ -72,12 +73,14 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
backend=os.environ.get("MINERU_BACKEND", "pipeline"), backend=os.environ.get("MINERU_BACKEND", "pipeline"),
server_url=os.environ.get("MINERU_SERVER_URL", ""), server_url=os.environ.get("MINERU_SERVER_URL", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
) )
return sections, tables, pdf_parser return sections, tables, pdf_parser
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs): def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
pdf_parser = DoclingParser() pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")
if not pdf_parser.check_installation(): if not pdf_parser.check_installation():
callback(-1, "Docling not found.") callback(-1, "Docling not found.")
@ -89,6 +92,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback, callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
) )
return sections, tables, pdf_parser return sections, tables, pdf_parser

View File

@ -21,8 +21,10 @@ import re
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper
from common.constants import ParserType from common.constants import ParserType
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
from deepdoc.parser import PdfParser, PlainParser from deepdoc.parser import PdfParser
import numpy as np import numpy as np
from rag.app.naive import by_plaintext, PARSERS
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
@ -147,19 +149,40 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"parser_config", { "parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE): if re.search(r"\.pdf$", filename, re.IGNORECASE):
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text": layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
pdf_parser = PlainParser()
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
pdf_parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
if name == "deepdoc":
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
sections, tables, pdf_parser = pdf_parser(
filename=filename,
binary=binary,
from_page=from_page,
to_page=to_page,
lang=lang,
callback=callback,
pdf_cls=Pdf,
parse_method="paper",
**kwargs
)
paper = { paper = {
"title": filename, "title": filename,
"authors": " ", "authors": " ",
"abstract": "", "abstract": "",
"sections": pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page)[0], "sections": sections,
"tables": [] "tables": tables
} }
else:
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=paper["tables"] tbls=paper["tables"]
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
paper["tables"] = tbls paper["tables"] = tbls