Feat: MinerU supports VLM-Transformers backend (#10809)

### What problem does this PR solve?

MinerU supports VLM-Transformers backend.

Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options:
pipeline | vlm-transformers, default is pipeline)

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
Yongteng Lei
2025-10-27 17:04:13 +08:00
committed by GitHub
parent 16ec6ad346
commit 5acc407240
3 changed files with 33 additions and 8 deletions

View File

@ -45,6 +45,9 @@ class MinerUContentType(StrEnum):
TABLE = "table" TABLE = "table"
TEXT = "text" TEXT = "text"
EQUATION = "equation" EQUATION = "equation"
CODE = "code"
LIST = "list"
DISCARDED = "discarded"
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
@ -80,8 +83,10 @@ class MinerUParser(RAGFlowPdfParser):
logging.error(f"[MinerU] Unexpected error during installation check: {e}") logging.error(f"[MinerU] Unexpected error during installation check: {e}")
return False return False
def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", lang: Optional[str] = None): def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
if backend:
cmd.extend(["-b", backend])
if lang: if lang:
cmd.extend(["-l", lang]) cmd.extend(["-l", lang])
@ -231,8 +236,10 @@ class MinerUParser(RAGFlowPdfParser):
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto") -> list[dict[str, Any]]: def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
subdir = output_dir / file_stem / method subdir = output_dir / file_stem / method
if backend.startswith("vlm-"):
subdir = output_dir / file_stem / "vlm"
json_file = subdir / f"{file_stem}_content_list.json" json_file = subdir / f"{file_stem}_content_list.json"
if not json_file.exists(): if not json_file.exists():
@ -259,6 +266,12 @@ class MinerUParser(RAGFlowPdfParser):
section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"]) section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"])
case MinerUContentType.EQUATION: case MinerUContentType.EQUATION:
section = output["text"] section = output["text"]
case MinerUContentType.CODE:
section = output["code_body"] + "\n".join(output.get("code_caption", []))
case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", []))
case MinerUContentType.DISCARDED:
pass
if section: if section:
sections.append((section, self._line_tag(output))) sections.append((section, self._line_tag(output)))
@ -274,6 +287,7 @@ class MinerUParser(RAGFlowPdfParser):
callback: Optional[Callable] = None, callback: Optional[Callable] = None,
*, *,
output_dir: Optional[str] = None, output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None, lang: Optional[str] = None,
method: str = "auto", method: str = "auto",
delete_output: bool = True, delete_output: bool = True,
@ -313,8 +327,8 @@ class MinerUParser(RAGFlowPdfParser):
self.__images__(pdf, zoomin=1) self.__images__(pdf, zoomin=1)
try: try:
self._run_mineru(pdf, out_dir, method=method, lang=lang) self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang)
outputs = self._read_output(out_dir, pdf.stem, method=method) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View File

@ -536,5 +536,16 @@ uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**. 4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component. 5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
---
### How to specify the settings of MinerU?
Set `MINERU_EXECUTABLE` to the path of the MinerU executable. (Default: mineru)
Set `MINERU_DELETE_OUTPUT=0` to keep MinerU's output. (Default: 1, which deletes temporary output)
Set `MINERU_OUTPUT_DIR` to specify the output directory. (Uses a temporary directory if unset)
Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options: pipeline | vlm-transformers, default is pipeline)
Other environment variables listed [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description) are supported automatically by MinerU.

View File

@ -439,7 +439,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
Successive text will be sliced into pieces using 'delimiter'. Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'. Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
""" """
is_english = lang.lower() == "english" # is_english(cks) is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get( parser_config = kwargs.get(
@ -463,7 +463,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
embeds = extract_embed_file(binary) embeds = extract_embed_file(binary)
else: else:
raise Exception("Embedding extraction from file path is not supported.") raise Exception("Embedding extraction from file path is not supported.")
# Recursively chunk each embedded file and collect results # Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds: for embed_filename, embed_bytes in embeds:
try: try:
@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
_SerializedRelationships.load_from_xml = load_from_xml_v2 _SerializedRelationships.load_from_xml = load_from_xml_v2
@ -530,6 +530,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary=binary, binary=binary,
callback=callback, callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
) )
parser_config["chunk_token_num"] = 0 parser_config["chunk_token_num"] = 0
@ -552,7 +553,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
elif layout_recognizer == "TCADP Parser": elif layout_recognizer == "TCADP Parser":
tcadp_parser = TCADPParser() tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation(): if not tcadp_parser.check_installation():