diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 61d385d3f..69a565c54 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -45,6 +45,9 @@ class MinerUContentType(StrEnum): TABLE = "table" TEXT = "text" EQUATION = "equation" + CODE = "code" + LIST = "list" + DISCARDED = "discarded" class MinerUParser(RAGFlowPdfParser): @@ -80,8 +83,10 @@ class MinerUParser(RAGFlowPdfParser): logging.error(f"[MinerU] Unexpected error during installation check: {e}") return False - def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", lang: Optional[str] = None): + def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None): cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] + if backend: + cmd.extend(["-b", backend]) if lang: cmd.extend(["-l", lang]) @@ -231,8 +236,10 @@ class MinerUParser(RAGFlowPdfParser): poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) return poss - def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto") -> list[dict[str, Any]]: + def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]: subdir = output_dir / file_stem / method + if backend.startswith("vlm-"): + subdir = output_dir / file_stem / "vlm" json_file = subdir / f"{file_stem}_content_list.json" if not json_file.exists(): @@ -259,6 +266,12 @@ class MinerUParser(RAGFlowPdfParser): section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"]) case MinerUContentType.EQUATION: section = output["text"] + case MinerUContentType.CODE: + section = output["code_body"] + "\n".join(output.get("code_caption", [])) + case MinerUContentType.LIST: + section = "\n".join(output.get("list_items", [])) + case MinerUContentType.DISCARDED: + pass if section: sections.append((section, self._line_tag(output))) @@ -274,6 +287,7 @@ class MinerUParser(RAGFlowPdfParser): callback: Optional[Callable] = None, *, output_dir: Optional[str] = None, + backend: str = "pipeline", lang: Optional[str] = None, method: str = "auto", delete_output: bool = True, @@ -313,8 +327,8 @@ class MinerUParser(RAGFlowPdfParser): self.__images__(pdf, zoomin=1) try: - self._run_mineru(pdf, out_dir, method=method, lang=lang) - outputs = self._read_output(out_dir, pdf.stem, method=method) + self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang) + outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") diff --git a/docs/faq.mdx b/docs/faq.mdx index 159e4937f..5448f078c 100644 --- a/docs/faq.mdx +++ b/docs/faq.mdx @@ -536,5 +536,16 @@ uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple 4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and slect **MinerU** in **PDF parser**. 5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component. +--- +### How to specify the settings of MinerU? +Set `MINERU_EXECUTABLE` to the path of the MinerU executable. (Default: mineru) + +Set `MINERU_DELETE_OUTPUT=0` to keep MinerU's output. (Default: 1, which deletes temporary output) + +Set `MINERU_OUTPUT_DIR` to specify the output directory. (Uses a temporary directory if unset) + +Set `MINERU_BACKEND="pipeline"` to choose the backend. (Options: pipeline | vlm-transformers, default is pipeline) + +Other environment variables listed [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description) are supported automatically by MinerU. diff --git a/rag/app/naive.py b/rag/app/naive.py index 29eef53b4..6c06e3b51 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -439,7 +439,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, Successive text will be sliced into pieces using 'delimiter'. Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. """ - + is_english = lang.lower() == "english" # is_english(cks) parser_config = kwargs.get( @@ -463,7 +463,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, embeds = extract_embed_file(binary) else: raise Exception("Embedding extraction from file path is not supported.") - + # Recursively chunk each embedded file and collect results for embed_filename, embed_bytes in embeds: try: @@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - + # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 _SerializedRelationships.load_from_xml = load_from_xml_v2 @@ -530,6 +530,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary=binary, callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), + backend=os.environ.get("MINERU_BACKEND", "pipeline"), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), ) parser_config["chunk_token_num"] = 0 @@ -552,7 +553,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") - elif layout_recognizer == "TCADP Parser": tcadp_parser = TCADPParser() if not tcadp_parser.check_installation():