mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: supports MinerU http-client/server method (#10961)
### What problem does this PR solve? Add support for MinerU http-client/server method. To use MinerU with vLLM server: 1. Set up a vLLM server running MinerU: ```bash mineru-vllm-server --port 30000 ``` 2. Configure the following environment variables: - `MINERU_EXECUTABLE=/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable) - `MINERU_BACKEND="vlm-http-client"` - `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"` 3. Follow the standard MinerU setup steps as described above. With this configuration, RAGFlow will connect to your vLLM server to perform document parsing, which can significantly improve parsing performance for complex documents while reducing the resource requirements on your RAGFlow server.   ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update --------- Co-authored-by: writinwaters <cai.keith@gmail.com>
This commit is contained in:
@ -15,6 +15,7 @@
|
|||||||
#
|
#
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
import platform
|
import platform
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
@ -22,17 +23,16 @@ import sys
|
|||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import zipfile
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from os import PathLike
|
from os import PathLike
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from queue import Empty, Queue
|
from queue import Empty, Queue
|
||||||
from typing import Any, Callable, Optional
|
from typing import Any, Callable, Optional
|
||||||
import requests
|
|
||||||
import os
|
|
||||||
import zipfile
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pdfplumber
|
import pdfplumber
|
||||||
|
import requests
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from strenum import StrEnum
|
from strenum import StrEnum
|
||||||
|
|
||||||
@ -54,22 +54,23 @@ class MinerUContentType(StrEnum):
|
|||||||
|
|
||||||
|
|
||||||
class MinerUParser(RAGFlowPdfParser):
|
class MinerUParser(RAGFlowPdfParser):
|
||||||
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987"):
|
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
|
||||||
self.mineru_path = Path(mineru_path)
|
self.mineru_path = Path(mineru_path)
|
||||||
self.mineru_api = mineru_api.rstrip('/')
|
self.mineru_api = mineru_api.rstrip("/")
|
||||||
|
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||||||
self.using_api = False
|
self.using_api = False
|
||||||
self.logger = logging.getLogger(self.__class__.__name__)
|
self.logger = logging.getLogger(self.__class__.__name__)
|
||||||
|
|
||||||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||||||
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||||||
if not root_dir:
|
if not root_dir:
|
||||||
files = zip_ref.namelist()
|
files = zip_ref.namelist()
|
||||||
if files and files[0].endswith('/'):
|
if files and files[0].endswith("/"):
|
||||||
root_dir = files[0]
|
root_dir = files[0]
|
||||||
else:
|
else:
|
||||||
root_dir = None
|
root_dir = None
|
||||||
|
|
||||||
if not root_dir or not root_dir.endswith('/'):
|
if not root_dir or not root_dir.endswith("/"):
|
||||||
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
|
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
|
||||||
zip_ref.extractall(extract_to)
|
zip_ref.extractall(extract_to)
|
||||||
return
|
return
|
||||||
@ -90,7 +91,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
os.makedirs(full_path, exist_ok=True)
|
os.makedirs(full_path, exist_ok=True)
|
||||||
else:
|
else:
|
||||||
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||||||
with open(full_path, 'wb') as f:
|
with open(full_path, "wb") as f:
|
||||||
f.write(zip_ref.read(filename))
|
f.write(zip_ref.read(filename))
|
||||||
|
|
||||||
def _is_http_endpoint_valid(self, url, timeout=5):
|
def _is_http_endpoint_valid(self, url, timeout=5):
|
||||||
@ -100,7 +101,15 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def check_installation(self) -> bool:
|
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||||
|
reason = ""
|
||||||
|
|
||||||
|
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
|
||||||
|
if backend not in valid_backends:
|
||||||
|
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||||||
|
logging.warning(reason)
|
||||||
|
return False, reason
|
||||||
|
|
||||||
subprocess_kwargs = {
|
subprocess_kwargs = {
|
||||||
"capture_output": True,
|
"capture_output": True,
|
||||||
"text": True,
|
"text": True,
|
||||||
@ -112,6 +121,32 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
if platform.system() == "Windows":
|
if platform.system() == "Windows":
|
||||||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||||||
|
|
||||||
|
if server_url is None:
|
||||||
|
server_url = self.mineru_server_url
|
||||||
|
|
||||||
|
if backend == "vlm-http-client" and server_url:
|
||||||
|
try:
|
||||||
|
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
||||||
|
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
||||||
|
if server_accessible:
|
||||||
|
self.using_api = False # We are using http client, not API
|
||||||
|
return True, reason
|
||||||
|
else:
|
||||||
|
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
||||||
|
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
||||||
|
return False, reason
|
||||||
|
except Exception as e:
|
||||||
|
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
||||||
|
try:
|
||||||
|
response = requests.get(server_url, timeout=5)
|
||||||
|
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
||||||
|
self.using_api = False
|
||||||
|
return True, reason
|
||||||
|
except Exception as e:
|
||||||
|
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
||||||
|
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
||||||
|
return False, reason
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
||||||
version_info = result.stdout.strip()
|
version_info = result.stdout.strip()
|
||||||
@ -119,7 +154,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
logging.info(f"[MinerU] Detected version: {version_info}")
|
logging.info(f"[MinerU] Detected version: {version_info}")
|
||||||
else:
|
else:
|
||||||
logging.info("[MinerU] Detected MinerU, but version info is empty.")
|
logging.info("[MinerU] Detected MinerU, but version info is empty.")
|
||||||
return True
|
return True, reason
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
@ -127,24 +162,31 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
|
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
|
||||||
|
|
||||||
|
# If executable check fails, try API check
|
||||||
try:
|
try:
|
||||||
if self.mineru_api:
|
if self.mineru_api:
|
||||||
# check openapi.json
|
# check openapi.json
|
||||||
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
||||||
|
if not openapi_exists:
|
||||||
|
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
||||||
|
return openapi_exists, reason
|
||||||
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
||||||
self.using_api = openapi_exists
|
self.using_api = openapi_exists
|
||||||
return openapi_exists
|
return openapi_exists, reason
|
||||||
else:
|
else:
|
||||||
logging.info("[MinerU] api not exists.")
|
logging.info("[MinerU] api not exists.")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||||||
logging.error(f"[MinerU] Unexpected error during api check: {e}")
|
logging.error(f"[MinerU] Unexpected error during api check: {e}")
|
||||||
return False
|
return False, reason
|
||||||
|
|
||||||
def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
def _run_mineru(
|
||||||
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||||
|
):
|
||||||
if self.using_api:
|
if self.using_api:
|
||||||
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
||||||
else:
|
else:
|
||||||
self._run_mineru_executable(input_path, output_dir, method, backend, lang, callback)
|
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
||||||
|
|
||||||
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
||||||
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
|
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
|
||||||
@ -158,9 +200,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
||||||
os.makedirs(output_path, exist_ok=True)
|
os.makedirs(output_path, exist_ok=True)
|
||||||
|
|
||||||
files = {
|
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||||||
"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")
|
|
||||||
}
|
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"output_dir": "./output",
|
"output_dir": "./output",
|
||||||
@ -177,23 +217,15 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
"return_images": True,
|
"return_images": True,
|
||||||
"response_format_zip": True,
|
"response_format_zip": True,
|
||||||
"start_page_id": 0,
|
"start_page_id": 0,
|
||||||
"end_page_id": 99999
|
"end_page_id": 99999,
|
||||||
}
|
}
|
||||||
|
|
||||||
headers = {
|
headers = {"Accept": "application/json"}
|
||||||
"Accept": "application/json"
|
|
||||||
}
|
|
||||||
try:
|
try:
|
||||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||||
if callback:
|
if callback:
|
||||||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||||
response = requests.post(
|
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
|
||||||
url=f"{self.mineru_api}/file_parse",
|
|
||||||
files=files,
|
|
||||||
data=data,
|
|
||||||
headers=headers,
|
|
||||||
timeout=1800
|
|
||||||
)
|
|
||||||
|
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
if response.headers.get("Content-Type") == "application/zip":
|
if response.headers.get("Content-Type") == "application/zip":
|
||||||
@ -216,12 +248,16 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||||
self.logger.info("[MinerU] Api completed successfully.")
|
self.logger.info("[MinerU] Api completed successfully.")
|
||||||
|
|
||||||
def _run_mineru_executable(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
def _run_mineru_executable(
|
||||||
|
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||||||
|
):
|
||||||
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
||||||
if backend:
|
if backend:
|
||||||
cmd.extend(["-b", backend])
|
cmd.extend(["-b", backend])
|
||||||
if lang:
|
if lang:
|
||||||
cmd.extend(["-l", lang])
|
cmd.extend(["-l", lang])
|
||||||
|
if server_url and backend == "vlm-http-client":
|
||||||
|
cmd.extend(["-u", server_url])
|
||||||
|
|
||||||
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
|
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
|
||||||
|
|
||||||
@ -425,6 +461,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
backend: str = "pipeline",
|
backend: str = "pipeline",
|
||||||
lang: Optional[str] = None,
|
lang: Optional[str] = None,
|
||||||
method: str = "auto",
|
method: str = "auto",
|
||||||
|
server_url: Optional[str] = None,
|
||||||
delete_output: bool = True,
|
delete_output: bool = True,
|
||||||
) -> tuple:
|
) -> tuple:
|
||||||
import shutil
|
import shutil
|
||||||
@ -470,7 +507,7 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
self.__images__(pdf, zoomin=1)
|
self.__images__(pdf, zoomin=1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, callback=callback)
|
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
||||||
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
||||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||||
if callback:
|
if callback:
|
||||||
@ -492,7 +529,8 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
parser = MinerUParser("mineru")
|
parser = MinerUParser("mineru")
|
||||||
print("MinerU available:", parser.check_installation())
|
ok, reason = parser.check_installation()
|
||||||
|
print("MinerU available:", ok)
|
||||||
|
|
||||||
filepath = ""
|
filepath = ""
|
||||||
with open(filepath, "rb") as file:
|
with open(filepath, "rb") as file:
|
||||||
|
|||||||
31
docs/faq.mdx
31
docs/faq.mdx
@ -540,11 +540,38 @@ uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
|
|||||||
|
|
||||||
### How to configure MinerU-specific settings?
|
### How to configure MinerU-specific settings?
|
||||||
|
|
||||||
1. Set `MINERU_EXECUTABLE` (default: `mineru`) to the path of the MinerU executable.
|
1. Set `MINERU_EXECUTABLE` (default: `mineru`) to the path to the MinerU executable.
|
||||||
2. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's output. (Default: `1`, which deletes temporary output)
|
2. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's output. (Default: `1`, which deletes temporary output)
|
||||||
3. Set `MINERU_OUTPUT_DIR` to specify the output directory for MinerU.
|
3. Set `MINERU_OUTPUT_DIR` to specify the output directory for MinerU.
|
||||||
4. Set `MINERU_BACKEND` to `"pipeline"`. (Options: `"pipeline"` (default) | `"vlm-transformers"`)
|
4. Set `MINERU_BACKEND` to specify a parsing backend:
|
||||||
|
- `"pipeline"` (default): The traditional multi-model pipeline.
|
||||||
|
- `"vlm-transformers"`: A vision-language model using HuggingFace Transformers.
|
||||||
|
- `"vlm-vllm-engine"`: A vision-language model using a local vLLM engine (requires a local GPU).
|
||||||
|
- `"vlm-http-client"`: A vision-language model accessed through an HTTP client that connects to a remote vLLM server (RAGFlow only requires CPU).
|
||||||
|
5. If using the `"vlm-http-client"` backend, you must also set `MINERU_SERVER_URL` to the URL of your vLLM server.
|
||||||
|
|
||||||
:::tip NOTE
|
:::tip NOTE
|
||||||
For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description).
|
For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description).
|
||||||
:::
|
:::
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### How to use MinerU with a vLLM server for document parsing?
|
||||||
|
|
||||||
|
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server. With this configuration, RAGFlow will connect to your remote vLLM server as a client and use its powerful GPU resources for document parsing. This significantly improves performance for parsing complex documents while reducing the resources required on your RAGFlow server. To configure MinerU with a vLLM server:
|
||||||
|
|
||||||
|
1. Set up a vLLM server running MinerU:
|
||||||
|
```bash
|
||||||
|
mineru-vllm-server --port 30000
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Configure the following environment variables in your **docker/.env** file:
|
||||||
|
- `MINERU_EXECUTABLE=/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable)
|
||||||
|
- `MINERU_BACKEND="vlm-http-client"`
|
||||||
|
- `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"`
|
||||||
|
|
||||||
|
3. Complete the remaining standard MinerU setup steps as described [here](#how-to-configure-mineru-specific-settings).
|
||||||
|
|
||||||
|
:::tip NOTE
|
||||||
|
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
|
||||||
|
:::
|
||||||
|
|||||||
@ -548,9 +548,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif layout_recognizer == "MinerU":
|
elif layout_recognizer == "MinerU":
|
||||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
|
||||||
if not pdf_parser.check_installation():
|
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
|
||||||
callback(-1, "MinerU not found.")
|
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
|
||||||
|
ok, reason = pdf_parser.check_installation(backend=mineru_backend)
|
||||||
|
if not ok:
|
||||||
|
callback(-1, f"MinerU not found or server not accessible: {reason}")
|
||||||
return res
|
return res
|
||||||
|
|
||||||
sections, tables = pdf_parser.parse_pdf(
|
sections, tables = pdf_parser.parse_pdf(
|
||||||
@ -558,7 +561,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
binary=binary,
|
binary=binary,
|
||||||
callback=callback,
|
callback=callback,
|
||||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||||
backend=os.environ.get("MINERU_BACKEND", "pipeline"),
|
backend=mineru_backend,
|
||||||
|
server_url=mineru_server_url,
|
||||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||||
)
|
)
|
||||||
parser_config["chunk_token_num"] = 0
|
parser_config["chunk_token_num"] = 0
|
||||||
|
|||||||
@ -224,8 +224,9 @@ class Parser(ProcessBase):
|
|||||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
||||||
if not pdf_parser.check_installation():
|
ok, reason = pdf_parser.check_installation()
|
||||||
raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")
|
if not ok:
|
||||||
|
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||||
|
|
||||||
lines, _ = pdf_parser.parse_pdf(
|
lines, _ = pdf_parser.parse_pdf(
|
||||||
filepath=name,
|
filepath=name,
|
||||||
|
|||||||
Reference in New Issue
Block a user