mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
### What problem does this PR solve? Add support for MinerU http-client/server method. To use MinerU with vLLM server: 1. Set up a vLLM server running MinerU: ```bash mineru-vllm-server --port 30000 ``` 2. Configure the following environment variables: - `MINERU_EXECUTABLE=/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable) - `MINERU_BACKEND="vlm-http-client"` - `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"` 3. Follow the standard MinerU setup steps as described above. With this configuration, RAGFlow will connect to your vLLM server to perform document parsing, which can significantly improve parsing performance for complex documents while reducing the resource requirements on your RAGFlow server.   ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update --------- Co-authored-by: writinwaters <cai.keith@gmail.com>
540 lines
22 KiB
Python
540 lines
22 KiB
Python
#
|
||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
#
|
||
import json
|
||
import logging
|
||
import os
|
||
import platform
|
||
import re
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import threading
|
||
import time
|
||
import zipfile
|
||
from io import BytesIO
|
||
from os import PathLike
|
||
from pathlib import Path
|
||
from queue import Empty, Queue
|
||
from typing import Any, Callable, Optional
|
||
|
||
import numpy as np
|
||
import pdfplumber
|
||
import requests
|
||
from PIL import Image
|
||
from strenum import StrEnum
|
||
|
||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||
|
||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
||
|
||
|
||
class MinerUContentType(StrEnum):
|
||
IMAGE = "image"
|
||
TABLE = "table"
|
||
TEXT = "text"
|
||
EQUATION = "equation"
|
||
CODE = "code"
|
||
LIST = "list"
|
||
DISCARDED = "discarded"
|
||
|
||
|
||
class MinerUParser(RAGFlowPdfParser):
|
||
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
|
||
self.mineru_path = Path(mineru_path)
|
||
self.mineru_api = mineru_api.rstrip("/")
|
||
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||
self.using_api = False
|
||
self.logger = logging.getLogger(self.__class__.__name__)
|
||
|
||
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
|
||
with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
||
if not root_dir:
|
||
files = zip_ref.namelist()
|
||
if files and files[0].endswith("/"):
|
||
root_dir = files[0]
|
||
else:
|
||
root_dir = None
|
||
|
||
if not root_dir or not root_dir.endswith("/"):
|
||
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
|
||
zip_ref.extractall(extract_to)
|
||
return
|
||
|
||
root_len = len(root_dir)
|
||
for member in zip_ref.infolist():
|
||
filename = member.filename
|
||
if filename == root_dir:
|
||
self.logger.info("[MinerU] Ignore root folder...")
|
||
continue
|
||
|
||
path = filename
|
||
if path.startswith(root_dir):
|
||
path = path[root_len:]
|
||
|
||
full_path = os.path.join(extract_to, path)
|
||
if member.is_dir():
|
||
os.makedirs(full_path, exist_ok=True)
|
||
else:
|
||
os.makedirs(os.path.dirname(full_path), exist_ok=True)
|
||
with open(full_path, "wb") as f:
|
||
f.write(zip_ref.read(filename))
|
||
|
||
def _is_http_endpoint_valid(self, url, timeout=5):
|
||
try:
|
||
response = requests.head(url, timeout=timeout, allow_redirects=True)
|
||
return response.status_code in [200, 301, 302, 307, 308]
|
||
except Exception:
|
||
return False
|
||
|
||
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
||
reason = ""
|
||
|
||
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
|
||
if backend not in valid_backends:
|
||
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||
logging.warning(reason)
|
||
return False, reason
|
||
|
||
subprocess_kwargs = {
|
||
"capture_output": True,
|
||
"text": True,
|
||
"check": True,
|
||
"encoding": "utf-8",
|
||
"errors": "ignore",
|
||
}
|
||
|
||
if platform.system() == "Windows":
|
||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||
|
||
if server_url is None:
|
||
server_url = self.mineru_server_url
|
||
|
||
if backend == "vlm-http-client" and server_url:
|
||
try:
|
||
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
||
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
||
if server_accessible:
|
||
self.using_api = False # We are using http client, not API
|
||
return True, reason
|
||
else:
|
||
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
||
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
||
return False, reason
|
||
except Exception as e:
|
||
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
||
try:
|
||
response = requests.get(server_url, timeout=5)
|
||
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
||
self.using_api = False
|
||
return True, reason
|
||
except Exception as e:
|
||
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
||
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
||
return False, reason
|
||
|
||
try:
|
||
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
||
version_info = result.stdout.strip()
|
||
if version_info:
|
||
logging.info(f"[MinerU] Detected version: {version_info}")
|
||
else:
|
||
logging.info("[MinerU] Detected MinerU, but version info is empty.")
|
||
return True, reason
|
||
except subprocess.CalledProcessError as e:
|
||
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
||
except FileNotFoundError:
|
||
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
||
except Exception as e:
|
||
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
|
||
|
||
# If executable check fails, try API check
|
||
try:
|
||
if self.mineru_api:
|
||
# check openapi.json
|
||
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
||
if not openapi_exists:
|
||
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
||
return openapi_exists, reason
|
||
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
||
self.using_api = openapi_exists
|
||
return openapi_exists, reason
|
||
else:
|
||
logging.info("[MinerU] api not exists.")
|
||
except Exception as e:
|
||
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||
logging.error(f"[MinerU] Unexpected error during api check: {e}")
|
||
return False, reason
|
||
|
||
def _run_mineru(
|
||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||
):
|
||
if self.using_api:
|
||
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
|
||
else:
|
||
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
|
||
|
||
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
|
||
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
|
||
|
||
pdf_file_path = str(input_path)
|
||
|
||
if not os.path.exists(pdf_file_path):
|
||
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
||
|
||
pdf_file_name = Path(pdf_file_path).stem.strip()
|
||
output_path = os.path.join(str(output_dir), pdf_file_name, method)
|
||
os.makedirs(output_path, exist_ok=True)
|
||
|
||
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||
|
||
data = {
|
||
"output_dir": "./output",
|
||
"lang_list": lang,
|
||
"backend": backend,
|
||
"parse_method": method,
|
||
"formula_enable": True,
|
||
"table_enable": True,
|
||
"server_url": None,
|
||
"return_md": True,
|
||
"return_middle_json": True,
|
||
"return_model_output": True,
|
||
"return_content_list": True,
|
||
"return_images": True,
|
||
"response_format_zip": True,
|
||
"start_page_id": 0,
|
||
"end_page_id": 99999,
|
||
}
|
||
|
||
headers = {"Accept": "application/json"}
|
||
try:
|
||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||
if callback:
|
||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
|
||
|
||
response.raise_for_status()
|
||
if response.headers.get("Content-Type") == "application/zip":
|
||
self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
|
||
|
||
if callback:
|
||
callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
|
||
|
||
with open(OUTPUT_ZIP_PATH, "wb") as f:
|
||
f.write(response.content)
|
||
|
||
self.logger.info(f"[MinerU] Unzip to {output_path}...")
|
||
self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
|
||
|
||
if callback:
|
||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||
else:
|
||
self.logger.warning("[MinerU] not zip returned from api:%s " % response.headers.get("Content-Type"))
|
||
except Exception as e:
|
||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||
self.logger.info("[MinerU] Api completed successfully.")
|
||
|
||
def _run_mineru_executable(
|
||
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
|
||
):
|
||
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
|
||
if backend:
|
||
cmd.extend(["-b", backend])
|
||
if lang:
|
||
cmd.extend(["-l", lang])
|
||
if server_url and backend == "vlm-http-client":
|
||
cmd.extend(["-u", server_url])
|
||
|
||
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
|
||
|
||
subprocess_kwargs = {
|
||
"stdout": subprocess.PIPE,
|
||
"stderr": subprocess.PIPE,
|
||
"text": True,
|
||
"encoding": "utf-8",
|
||
"errors": "ignore",
|
||
"bufsize": 1,
|
||
}
|
||
|
||
if platform.system() == "Windows":
|
||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||
|
||
process = subprocess.Popen(cmd, **subprocess_kwargs)
|
||
stdout_queue, stderr_queue = Queue(), Queue()
|
||
|
||
def enqueue_output(pipe, queue, prefix):
|
||
for line in iter(pipe.readline, ""):
|
||
if line.strip():
|
||
queue.put((prefix, line.strip()))
|
||
pipe.close()
|
||
|
||
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
|
||
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
|
||
|
||
while process.poll() is None:
|
||
for q in (stdout_queue, stderr_queue):
|
||
try:
|
||
while True:
|
||
prefix, line = q.get_nowait()
|
||
if prefix == "STDOUT":
|
||
self.logger.info(f"[MinerU] {line}")
|
||
else:
|
||
self.logger.warning(f"[MinerU] {line}")
|
||
except Empty:
|
||
pass
|
||
time.sleep(0.1)
|
||
|
||
return_code = process.wait()
|
||
if return_code != 0:
|
||
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
|
||
self.logger.info("[MinerU] Command completed successfully.")
|
||
|
||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||
self.page_from = page_from
|
||
self.page_to = page_to
|
||
try:
|
||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||
self.pdf = pdf
|
||
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||
except Exception as e:
|
||
self.page_images = None
|
||
self.total_page = 0
|
||
logging.exception(e)
|
||
|
||
def _line_tag(self, bx):
|
||
pn = [bx["page_idx"] + 1]
|
||
positions = bx["bbox"]
|
||
x0, top, x1, bott = positions
|
||
|
||
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
|
||
page_width, page_height = self.page_images[bx["page_idx"]].size
|
||
x0 = (x0 / 1000.0) * page_width
|
||
x1 = (x1 / 1000.0) * page_width
|
||
top = (top / 1000.0) * page_height
|
||
bott = (bott / 1000.0) * page_height
|
||
|
||
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
|
||
|
||
def crop(self, text, ZM=1, need_position=False):
|
||
imgs = []
|
||
poss = self.extract_positions(text)
|
||
if not poss:
|
||
if need_position:
|
||
return None, None
|
||
return
|
||
|
||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||
GAP = 6
|
||
pos = poss[0]
|
||
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||
pos = poss[-1]
|
||
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
|
||
|
||
positions = []
|
||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||
right = left + max_width
|
||
|
||
if bottom <= top:
|
||
bottom = top + 2
|
||
|
||
for pn in pns[1:]:
|
||
bottom += self.page_images[pn - 1].size[1]
|
||
|
||
img0 = self.page_images[pns[0]]
|
||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
||
crop0 = img0.crop((x0, y0, x1, y1))
|
||
imgs.append(crop0)
|
||
if 0 < ii < len(poss) - 1:
|
||
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
||
|
||
bottom -= img0.size[1]
|
||
for pn in pns[1:]:
|
||
page = self.page_images[pn]
|
||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
||
cimgp = page.crop((x0, y0, x1, y1))
|
||
imgs.append(cimgp)
|
||
if 0 < ii < len(poss) - 1:
|
||
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
||
bottom -= page.size[1]
|
||
|
||
if not imgs:
|
||
if need_position:
|
||
return None, None
|
||
return
|
||
|
||
height = 0
|
||
for img in imgs:
|
||
height += img.size[1] + GAP
|
||
height = int(height)
|
||
width = int(np.max([i.size[0] for i in imgs]))
|
||
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
||
height = 0
|
||
for ii, img in enumerate(imgs):
|
||
if ii == 0 or ii + 1 == len(imgs):
|
||
img = img.convert("RGBA")
|
||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||
overlay.putalpha(128)
|
||
img = Image.alpha_composite(img, overlay).convert("RGB")
|
||
pic.paste(img, (0, int(height)))
|
||
height += img.size[1] + GAP
|
||
|
||
if need_position:
|
||
return pic, positions
|
||
return pic
|
||
|
||
@staticmethod
|
||
def extract_positions(txt: str):
|
||
poss = []
|
||
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
||
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
||
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||
return poss
|
||
|
||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
|
||
subdir = output_dir / file_stem / method
|
||
if backend.startswith("vlm-"):
|
||
subdir = output_dir / file_stem / "vlm"
|
||
json_file = subdir / f"{file_stem}_content_list.json"
|
||
|
||
if not json_file.exists():
|
||
raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
|
||
|
||
with open(json_file, "r", encoding="utf-8") as f:
|
||
data = json.load(f)
|
||
|
||
for item in data:
|
||
for key in ("img_path", "table_img_path", "equation_img_path"):
|
||
if key in item and item[key]:
|
||
item[key] = str((subdir / item[key]).resolve())
|
||
return data
|
||
|
||
def _transfer_to_sections(self, outputs: list[dict[str, Any]]):
|
||
sections = []
|
||
for output in outputs:
|
||
match output["type"]:
|
||
case MinerUContentType.TEXT:
|
||
section = output["text"]
|
||
case MinerUContentType.TABLE:
|
||
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
|
||
if not section.strip():
|
||
section = "FAILED TO PARSE TABLE"
|
||
case MinerUContentType.IMAGE:
|
||
section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"])
|
||
case MinerUContentType.EQUATION:
|
||
section = output["text"]
|
||
case MinerUContentType.CODE:
|
||
section = output["code_body"] + "\n".join(output.get("code_caption", []))
|
||
case MinerUContentType.LIST:
|
||
section = "\n".join(output.get("list_items", []))
|
||
case MinerUContentType.DISCARDED:
|
||
pass
|
||
|
||
if section:
|
||
sections.append((section, self._line_tag(output)))
|
||
return sections
|
||
|
||
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
|
||
return []
|
||
|
||
def parse_pdf(
|
||
self,
|
||
filepath: str | PathLike[str],
|
||
binary: BytesIO | bytes,
|
||
callback: Optional[Callable] = None,
|
||
*,
|
||
output_dir: Optional[str] = None,
|
||
backend: str = "pipeline",
|
||
lang: Optional[str] = None,
|
||
method: str = "auto",
|
||
server_url: Optional[str] = None,
|
||
delete_output: bool = True,
|
||
) -> tuple:
|
||
import shutil
|
||
|
||
temp_pdf = None
|
||
created_tmp_dir = False
|
||
|
||
# remove spaces, or mineru crash, and _read_output fail too
|
||
file_path = Path(filepath)
|
||
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
|
||
pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)
|
||
|
||
if binary:
|
||
temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
|
||
temp_pdf = temp_dir / pdf_file_name
|
||
with open(temp_pdf, "wb") as f:
|
||
f.write(binary)
|
||
pdf = temp_pdf
|
||
self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
|
||
if callback:
|
||
callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
|
||
else:
|
||
if pdf_file_path_valid != filepath:
|
||
self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
|
||
shutil.move(filepath, pdf_file_path_valid)
|
||
pdf = Path(pdf_file_path_valid)
|
||
if not pdf.exists():
|
||
if callback:
|
||
callback(-1, f"[MinerU] PDF not found: {pdf}")
|
||
raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")
|
||
|
||
if output_dir:
|
||
out_dir = Path(output_dir)
|
||
out_dir.mkdir(parents=True, exist_ok=True)
|
||
else:
|
||
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
|
||
created_tmp_dir = True
|
||
|
||
self.logger.info(f"[MinerU] Output directory: {out_dir}")
|
||
if callback:
|
||
callback(0.15, f"[MinerU] Output directory: {out_dir}")
|
||
|
||
self.__images__(pdf, zoomin=1)
|
||
|
||
try:
|
||
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
|
||
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
|
||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||
if callback:
|
||
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||
return self._transfer_to_sections(outputs), self._transfer_to_tables(outputs)
|
||
finally:
|
||
if temp_pdf and temp_pdf.exists():
|
||
try:
|
||
temp_pdf.unlink()
|
||
temp_pdf.parent.rmdir()
|
||
except Exception:
|
||
pass
|
||
if delete_output and created_tmp_dir and out_dir.exists():
|
||
try:
|
||
shutil.rmtree(out_dir)
|
||
except Exception:
|
||
pass
|
||
|
||
|
||
if __name__ == "__main__":
|
||
parser = MinerUParser("mineru")
|
||
ok, reason = parser.check_installation()
|
||
print("MinerU available:", ok)
|
||
|
||
filepath = ""
|
||
with open(filepath, "rb") as file:
|
||
outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
|
||
for output in outputs:
|
||
print(output)
|