Feat: MinerU supports VLM-Transformers backend (#10809)

### What problem does this PR solve?

MinerU supports VLM-Transformers backend.

Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options:
pipeline | vlm-transformers, default is pipeline)

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-10-27 17:04:13 +08:00
committed by GitHub
parent 16ec6ad346
commit 5acc407240
3 changed files with 33 additions and 8 deletions

View File

@ -45,6 +45,9 @@ class MinerUContentType(StrEnum):
TABLE = "table" TABLE = "table"
TEXT = "text" TEXT = "text"
EQUATION = "equation" EQUATION = "equation"
CODE = "code"
LIST = "list"
DISCARDED = "discarded"
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
@ -80,8 +83,10 @@ class MinerUParser(RAGFlowPdfParser):
logging.error(f"[MinerU] Unexpected error during installation check: {e}") logging.error(f"[MinerU] Unexpected error during installation check: {e}")
return False return False
def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", lang: Optional[str] = None): def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
if backend:
cmd.extend(["-b", backend])
if lang: if lang:
cmd.extend(["-l", lang]) cmd.extend(["-l", lang])
@ -231,8 +236,10 @@ class MinerUParser(RAGFlowPdfParser):
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto") -> list[dict[str, Any]]: def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
subdir = output_dir / file_stem / method subdir = output_dir / file_stem / method
if backend.startswith("vlm-"):
subdir = output_dir / file_stem / "vlm"
json_file = subdir / f"{file_stem}_content_list.json" json_file = subdir / f"{file_stem}_content_list.json"
if not json_file.exists(): if not json_file.exists():
@ -259,6 +266,12 @@ class MinerUParser(RAGFlowPdfParser):
section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"]) section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"])
case MinerUContentType.EQUATION: case MinerUContentType.EQUATION:
section = output["text"] section = output["text"]
case MinerUContentType.CODE:
section = output["code_body"] + "\n".join(output.get("code_caption", []))
case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", []))
case MinerUContentType.DISCARDED:
pass
if section: if section:
sections.append((section, self._line_tag(output))) sections.append((section, self._line_tag(output)))
@ -274,6 +287,7 @@ class MinerUParser(RAGFlowPdfParser):
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
*, *,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None, lang: Optional[str] = None,
method: str = "auto", method: str = "auto",
delete_output: bool = True, delete_output: bool = True,
@ -313,8 +327,8 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, lang=lang) self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang)
outputs = self._read_output(out_dir, pdf.stem, method=method) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View File

@ -536,5 +536,16 @@ uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**. 4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component. 5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
---
### How to specify the settings of MinerU?
Set `MINERU_EXECUTABLE` to the path of the MinerU executable. (Default: mineru)
Set `MINERU_DELETE_OUTPUT=0` to keep MinerU's output. (Default: 1, which deletes temporary output)
Set `MINERU_OUTPUT_DIR` to specify the output directory. (Uses a temporary directory if unset)
Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options: pipeline | vlm-transformers, default is pipeline)
Other environment variables listed [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description) are supported automatically by MinerU.

View File

@ -439,7 +439,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
Successive text will be sliced into pieces using 'delimiter'. Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'. Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
""" """
is_english = lang.lower() == "english" # is_english(cks) is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get( parser_config = kwargs.get(
@ -463,7 +463,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
embeds = extract_embed_file(binary) embeds = extract_embed_file(binary)
else: else:
raise Exception("Embedding extraction from file path is not supported.") raise Exception("Embedding extraction from file path is not supported.")
# Recursively chunk each embedded file and collect results # Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds: for embed_filename, embed_bytes in embeds:
try: try:
@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
@ -530,6 +530,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary=binary, binary=binary,
callback=callback, callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
) )
parser_config["chunk_token_num"] = 0 parser_config["chunk_token_num"] = 0
@ -552,7 +553,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser": elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser() tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():