Feat: Support more chunking methods (#11000)

### What problem does this PR solve?

Feat: Support more chunking methods #10772 

This PR enables multiple chunking methods — including books, laws,
naive, one, and presentation — to be used with all existing PDF parsers
(DeepDOC, MinerU, Docling, TCADP, Plain Text, and Vision modes).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-11-05 13:00:42 +08:00
committed by GitHub
parent f126875ec6
commit cf9611c96f
6 changed files with 264 additions and 133 deletions

View File

@ -26,7 +26,6 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from tika import parser
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
@ -39,6 +38,100 @@ from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
def DeepDOC_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, pdf_cls = None ,**kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
callback=callback,
**kwargs)
return sections, tables, pdf_parser
def MinerU_parser(filename, binary=None, callback=None, **kwargs):
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
if not pdf_parser.check_installation():
callback(-1, "MinerU not found.")
return None, None
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser
def Docling_parser(filename, binary=None, callback=None, **kwargs):
pdf_parser = DoclingParser()
if not pdf_parser.check_installation():
callback(-1, "Docling not found.")
return None, None
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
return sections, tables, pdf_parser
def TCADP_parser(filename, binary=None, callback=None, **kwargs):
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return None, None
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
return sections, tables, tcadp_parser
def plaintext_parser(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
if kwargs.get("layout_recognizer", "") == "Plain Text":
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
return sections, tables, pdf_parser
PARSERS = {
"deepdoc": DeepDOC_parser,
"mineru": MinerU_parser,
"docling": Docling_parser,
"tcadp": TCADP_parser,
"plaintext": plaintext_parser, # default
}
class Docx(DocxParser):
def __init__(self):
@ -365,7 +458,7 @@ class Markdown(MarkdownParser):
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
return soup
def get_picture_urls(self, soup):
if soup:
return [img.get('src') for img in soup.find_all('img') if img.get('src')]
@ -375,7 +468,7 @@ class Markdown(MarkdownParser):
if soup:
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
return []
def get_pictures(self, text):
"""Download and open all images from markdown text."""
import requests
@ -535,82 +628,29 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
name = layout_recognizer.strip().lower()
parser = PARSERS.get(name, plaintext_parser)
callback(0.1, "Start to parse.")
if layout_recognizer == "DeepDOC":
pdf_parser = Pdf()
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables=vision_figure_parser_pdf_wrapper(tbls=tables,callback=callback,**kwargs)
sections, tables, _ = parser(
filename = filename,
binary = binary,
from_page = from_page,
to_page = to_page,
lang = lang,
callback = callback,
**kwargs
)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
if not sections and not tables:
return []
elif layout_recognizer == "MinerU":
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
ok, reason = pdf_parser.check_installation(backend=mineru_backend)
if not ok:
callback(-1, f"MinerU not found or server not accessible: {reason}")
return res
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=mineru_backend,
server_url=mineru_server_url,
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
if name in ["tcadp", "docling", "mineru"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
elif layout_recognizer == "Docling":
pdf_parser = DoclingParser()
if not pdf_parser.check_installation():
callback(-1, "Docling not found.")
return res
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
else:
if layout_recognizer == "Plain Text":
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback)
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
@ -735,9 +775,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
logging.info("naive_merge({}): {}".format(filename, timer() - st))
if embed_res:
res.extend(embed_res)
if url_res: