Feat: MinerU supports VLM-Transfomers backend (#10809)

### What problem does this PR solve?

MinerU supports VLM-Transfomers backend.

Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options:
pipeline | vlm-transformers, default is pipeline)

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-10-27 17:04:13 +08:00
committed by GitHub
parent 16ec6ad346
commit 5acc407240
3 changed files with 33 additions and 8 deletions

View File

@ -439,7 +439,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
@ -463,7 +463,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
embeds = extract_embed_file(binary)
else:
raise Exception("Embedding extraction from file path is not supported.")
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2
@ -530,6 +530,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary=binary,
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
)
parser_config["chunk_token_num"] = 0
@ -552,7 +553,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():