Feat: Support more chunking methods (#11000)

### What problem does this PR solve?

Feat: Support more chunking methods #10772 

This PR enables multiple chunking methods — including books, laws,
naive, one, and presentation — to be used with all existing PDF parsers
(DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-05 13:00:42 +08:00
committed by GitHub
parent f126875ec6
commit cf9611c96f
6 changed files with 264 additions and 133 deletions

View File

@ -15,18 +15,18 @@
# #
import logging import logging
from tika import parser
import re import re
from io import BytesIO from io import BytesIO
from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text
from rag.app import naive from rag.app import naive
from rag.app.naive import plaintext_parser, PARSERS
from rag.nlp import bullets_category, is_english,remove_contents_table, \ from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks tokenize_chunks
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from PIL import Image from PIL import Image
@ -96,13 +96,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser()
sections, tbls = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs)
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
callback(0.1, "Start to parse.")
sections, tables, _ = parser(
filename = filename,
binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
pdf_cls = Pdf,
**kwargs
)
if not sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
elif re.search(r"\.txt$", filename, re.IGNORECASE): elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
txt = get_text(filename, binary) txt = get_text(filename, binary)

View File

@ -15,7 +15,6 @@
# #
import logging import logging
from tika import parser
import re import re
from io import BytesIO from io import BytesIO
from docx import Document from docx import Document
@ -25,8 +24,8 @@ from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, \ from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import plaintext_parser, PARSERS
@ -156,13 +155,36 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None) return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser() if isinstance(layout_recognizer, bool):
for txt, poss in pdf_parser(filename if not binary else binary, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
from_page=from_page, to_page=to_page, callback=callback)[0]:
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
callback(0.1, "Start to parse.")
raw_sections, tables, _ = parser(
filename = filename,
binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
pdf_cls = Pdf,
**kwargs
)
if not raw_sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
for txt, poss in raw_sections:
sections.append(txt + poss) sections.append(txt + poss)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE): elif re.search(r"\.(txt|md|markdown|mdx)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
txt = get_text(filename, binary) txt = get_text(filename, binary)

View File

@ -22,11 +22,11 @@ from common.constants import ParserType
from io import BytesIO from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level
from common.token_utils import num_tokens_from_string from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, PlainParser, DocxParser from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper
from docx import Document from docx import Document
from PIL import Image from PIL import Image
from rag.app.naive import plaintext_parser, PARSERS
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
@ -196,15 +196,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# is it English # is it English
eng = lang.lower() == "english" # pdf_parser.is_english eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE): if re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser() if isinstance(layout_recognizer, bool):
sections, tbls = pdf_parser(filename if not binary else binary, layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
from_page=from_page, to_page=to_page, callback=callback)
if sections and len(sections[0]) < 3: name = layout_recognizer.strip().lower()
sections = [(t, lvl, [[0] * 5]) for t, lvl in sections] pdf_parser = PARSERS.get(name, plaintext_parser)
# set pivot using the most frequent type of title, callback(0.1, "Start to parse.")
# then merge between 2 pivot
sections, tbls, pdf_parser = pdf_parser(
filename = filename,
binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
pdf_cls = Pdf,
**kwargs
)
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03: if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.03:
max_lvl = max([lvl for _, lvl in pdf_parser.outlines]) max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
most_level = max(0, max_lvl - 1) most_level = max(0, max_lvl - 1)

View File

@ -26,7 +26,6 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml from docx.opc.oxml import parse_xml
from markdown import markdown from markdown import markdown
from PIL import Image from PIL import Image
from tika import parser
from common.constants import LLMType from common.constants import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
@ -39,6 +38,100 @@ from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls=None, **kwargs):
    """Parse a PDF with the DeepDOC layout parser.

    Args:
        filename: Path/name of the PDF; used as the parser input when
            ``binary`` is falsy.
        binary: Raw document content; takes precedence over ``filename``
            when truthy.
        from_page: First page forwarded to the parser.
        to_page: Last page forwarded to the parser.
        callback: Progress callback forwarded to the parser and to the
            figure post-processing step.
        pdf_cls: Optional parser class to instantiate (called with no
            arguments). Falls back to the module-level ``Pdf`` class.
        **kwargs: Forwarded unchanged to ``vision_figure_parser_pdf_wrapper``.

    Returns:
        A ``(sections, tables, parser_instance)`` triple, where ``tables`` is
        whatever ``vision_figure_parser_pdf_wrapper`` returned.
    """
    # Let chunkers inject their own PdfParser subclass; default to naive's Pdf.
    parser_cls = pdf_cls if pdf_cls else Pdf
    parser_instance = parser_cls()

    # In-memory content wins over the file name.
    source = binary if binary else filename
    sections, raw_tables = parser_instance(
        source,
        from_page=from_page,
        to_page=to_page,
        callback=callback,
    )

    tables = vision_figure_parser_pdf_wrapper(tbls=raw_tables, callback=callback, **kwargs)
    return sections, tables, parser_instance
def MinerU_parser(filename, binary=None, callback=None, **kwargs):
    """Parse a PDF with the MinerU backend.

    Configuration is read from the environment: ``MINERU_EXECUTABLE``,
    ``MINERU_APISERVER``, ``MINERU_OUTPUT_DIR``, ``MINERU_BACKEND`` and
    ``MINERU_DELETE_OUTPUT``.

    Args:
        filename: Path of the PDF, passed to MinerU as ``filepath``.
        binary: Raw document content, passed through to MinerU.
        callback: Optional progress callback; receives ``(-1, message)``
            when MinerU is unavailable.
        **kwargs: Accepted for signature parity with the other parsers
            (e.g. ``from_page``, ``lang``, ``pdf_cls``); not used here.

    Returns:
        A ``(sections, tables, pdf_parser)`` triple. When MinerU is not
        available the triple is ``(None, None, pdf_parser)`` so that callers
        unpacking three values keep working.
    """
    mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
    mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
    pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)

    # NOTE(review): some MinerUParser revisions appear to return
    # ``(ok, reason)`` from check_installation(); a tuple is always truthy,
    # so normalise both shapes to a plain flag. Confirm against the parser.
    status = pdf_parser.check_installation()
    installed = status[0] if isinstance(status, tuple) else status
    if not installed:
        if callback:
            callback(-1, "MinerU not found.")
        # Always return three values: every caller unpacks a triple.
        return None, None, pdf_parser

    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
        backend=os.environ.get("MINERU_BACKEND", "pipeline"),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    )
    return sections, tables, pdf_parser
def Docling_parser(filename, binary=None, callback=None, **kwargs):
    """Parse a PDF with the Docling backend.

    Args:
        filename: Path of the PDF, passed to Docling as ``filepath``.
        binary: Raw document content, passed through to Docling.
        callback: Optional progress callback; receives ``(-1, message)``
            when Docling is unavailable.
        **kwargs: Accepted for signature parity with the other parsers;
            not used here.

    Returns:
        A ``(sections, tables, pdf_parser)`` triple. When Docling is not
        available the triple is ``(None, None, pdf_parser)`` so that callers
        unpacking three values keep working.
    """
    pdf_parser = DoclingParser()
    if not pdf_parser.check_installation():
        if callback:
            callback(-1, "Docling not found.")
        # Always return three values: every caller unpacks a triple.
        return None, None, pdf_parser

    # NOTE(review): the output settings reuse the MINERU_* environment
    # variables; presumably intentional sharing - confirm.
    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
        delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
    )
    return sections, tables, pdf_parser
def TCADP_parser(filename, binary=None, callback=None, **kwargs):
    """Parse a PDF with the TCADP parser.

    The output directory is read from the ``TCADP_OUTPUT_DIR`` environment
    variable.

    Args:
        filename: Path of the PDF, passed to the parser as ``filepath``.
        binary: Raw document content, passed through to the parser.
        callback: Optional progress callback; receives ``(-1, message)``
            when the parser is unavailable.
        **kwargs: Accepted for signature parity with the other parsers;
            not used here.

    Returns:
        A ``(sections, tables, tcadp_parser)`` triple. When the parser is
        not available the triple is ``(None, None, tcadp_parser)`` so that
        callers unpacking three values keep working.
    """
    tcadp_parser = TCADPParser()
    if not tcadp_parser.check_installation():
        if callback:
            callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
        # Always return three values: every caller unpacks a triple.
        return None, None, tcadp_parser

    sections, tables = tcadp_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
        file_type="PDF",
    )
    return sections, tables, tcadp_parser
def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    """Parse a PDF as plain text, or with a vision model.

    The parser is chosen from ``kwargs["layout_recognizer"]``: the exact
    value ``"Plain Text"`` selects ``PlainParser``; any other value
    (including a missing one) is used as the name of an IMAGE2TEXT model
    and selects ``VisionParser``.

    Args:
        filename: Path/name of the PDF; used as the parser input when
            ``binary`` is falsy.
        binary: Raw document content; takes precedence over ``filename``
            when truthy.
        from_page: First page forwarded to the parser.
        to_page: Last page forwarded to the parser.
        callback: Progress callback forwarded to the parser.
        **kwargs: May carry ``layout_recognizer`` and ``lang``;
            ``tenant_id`` is required on the vision-model path. All of
            them are also forwarded to ``VisionParser``.

    Returns:
        A ``(sections, tables, parser_instance)`` triple.

    Raises:
        KeyError: If the vision-model path is taken and ``tenant_id`` is
            not in ``kwargs``.
    """
    # NOTE(review): the Plain Text branch is only reachable when the caller
    # forwards ``layout_recognizer`` in kwargs - verify the call sites do.
    recognizer = kwargs.get("layout_recognizer", "")

    if recognizer != "Plain Text":
        vision_model = LLMBundle(
            kwargs["tenant_id"],
            LLMType.IMAGE2TEXT,
            llm_name=recognizer,
            lang=kwargs.get("lang", "Chinese"),
        )
        parser_instance = VisionParser(vision_model=vision_model, **kwargs)
    else:
        parser_instance = PlainParser()

    # In-memory content wins over the file name.
    source = binary if binary else filename
    sections, tables = parser_instance(
        source,
        from_page=from_page,
        to_page=to_page,
        callback=callback,
    )
    return sections, tables, parser_instance
# Dispatch table from a normalised layout-recognizer name to its parser
# function. Callers look up ``layout_recognizer.strip().lower()`` and fall
# back to ``plaintext_parser`` for unknown names (e.g. vision model names).
PARSERS = {
    "deepdoc": DeepDOC_parser,
    "mineru": MinerU_parser,
    "docling": Docling_parser,
    "tcadp": TCADP_parser,
    # The recognizer is configured as "TCADP Parser"; once lower-cased it
    # would miss the "tcadp" key and silently fall through to the default.
    "tcadp parser": TCADP_parser,
    "plaintext": plaintext_parser,  # default
    # "Plain Text" lower-cases to "plain text", so map that spelling too.
    "plain text": plaintext_parser,
}
class Docx(DocxParser): class Docx(DocxParser):
def __init__(self): def __init__(self):
@ -535,80 +628,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if isinstance(layout_recognizer, bool): if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
if layout_recognizer == "DeepDOC": sections, tables, _ = parser(
pdf_parser = Pdf() filename = filename,
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "MinerU":
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
ok, reason = pdf_parser.check_installation(backend=mineru_backend)
if not ok:
callback(-1, f"MinerU not found or server not accessible: {reason}")
return res
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary = binary, binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback, callback = callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), **kwargs
backend=mineru_backend,
server_url=mineru_server_url,
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
) )
if not sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0 parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
elif layout_recognizer == "Docling":
pdf_parser = DoclingParser()
if not pdf_parser.check_installation():
callback(-1, "Docling not found.")
return res
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
else:
if layout_recognizer == "Plain Text":
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")

View File

@ -15,16 +15,15 @@
# #
import logging import logging
from tika import parser
from io import BytesIO from io import BytesIO
import re import re
from deepdoc.parser.utils import get_text from deepdoc.parser.utils import get_text
from rag.app import naive from rag.app import naive
from rag.nlp import rag_tokenizer, tokenize from rag.nlp import rag_tokenizer, tokenize
from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper,vision_figure_parser_docx_wrapper from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from rag.app.naive import plaintext_parser, PARSERS
class Pdf(PdfParser): class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0, def __call__(self, filename, binary=None, from_page=0,
@ -83,12 +82,34 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
pdf_parser = Pdf() layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if parser_config.get("layout_recognize", "DeepDOC") == "Plain Text":
pdf_parser = PlainParser() if isinstance(layout_recognizer, bool):
sections, tbls = pdf_parser( layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
filename if not binary else binary, to_page=to_page, callback=callback)
tbls=vision_figure_parser_pdf_wrapper(tbls=tbls,callback=callback,**kwargs) name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
callback(0.1, "Start to parse.")
sections, tbls, _ = parser(
filename = filename,
binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
pdf_cls = Pdf,
**kwargs
)
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
for (img, rows), poss in tbls: for (img, rows), poss in tbls:
if not rows: if not rows:
continue continue

View File

@ -20,14 +20,11 @@ from io import BytesIO
from PIL import Image from PIL import Image
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from deepdoc.parser.pdf_parser import VisionParser
from rag.nlp import tokenize, is_english from rag.nlp import tokenize, is_english
from rag.nlp import rag_tokenizer from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, PptParser, PlainParser from deepdoc.parser import PdfParser, PptParser, PlainParser
from PyPDF2 import PdfReader as pdf2_read from PyPDF2 import PdfReader as pdf2_read
from rag.app.naive import plaintext_parser, PARSERS
class Ppt(PptParser): class Ppt(PptParser):
def __call__(self, fnm, from_page, to_page, callback=None): def __call__(self, fnm, from_page, to_page, callback=None):
@ -54,7 +51,6 @@ class Ppt(PptParser):
self.is_english = is_english(txts) self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))] return [(txts[i], imgs[i]) for i in range(len(txts))]
class Pdf(PdfParser): class Pdf(PdfParser):
def __init__(self): def __init__(self):
super().__init__() super().__init__()
@ -84,7 +80,7 @@ class Pdf(PdfParser):
res.append((lines, self.page_images[i])) res.append((lines, self.page_images[i]))
callback(0.9, "Page {}~{}: Parsing finished".format( callback(0.9, "Page {}~{}: Parsing finished".format(
from_page, min(to_page, self.total_page))) from_page, min(to_page, self.total_page)))
return res return res, []
class PlainPdf(PlainParser): class PlainPdf(PlainParser):
@ -95,7 +91,7 @@ class PlainPdf(PlainParser):
for page in self.pdf.pages[from_page: to_page]: for page in self.pdf.pages[from_page: to_page]:
page_txt.append(page.extract_text()) page_txt.append(page.extract_text())
callback(0.9, "Parsing finished") callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt] return [(txt, None) for txt in page_txt], []
def chunk(filename, binary=None, from_page=0, to_page=100000, def chunk(filename, binary=None, from_page=0, to_page=100000,
@ -130,20 +126,33 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return res return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf() if isinstance(layout_recognizer, bool):
sections = pdf_parser(filename, binary, from_page=from_page, to_page=to_page, callback=callback) layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
elif layout_recognizer == "Plain Text":
pdf_parser = PlainParser() name = layout_recognizer.strip().lower()
sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, parser = PARSERS.get(name, plaintext_parser)
callback=callback) callback(0.1, "Start to parse.")
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang) sections, _, _ = parser(
pdf_parser = VisionParser(vision_model=vision_model, **kwargs) filename = filename,
sections, _ = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, binary = binary,
callback=callback) from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
pdf_cls = Pdf,
**kwargs
)
if not sections:
return []
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
for pn, (txt, img) in enumerate(sections): for pn, (txt, img) in enumerate(sections):
d = copy.deepcopy(doc) d = copy.deepcopy(doc)
pn += from_page pn += from_page