mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-30 15:16:45 +08:00
feat: PaddleOCR PDF parser supports thumnails and positions (#12565)
### What problem does this PR solve? 1. PaddleOCR PDF parser supports thumnails and positions. 2. Add FAQ documentation for PaddleOCR PDF parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -24,7 +24,10 @@ from os import PathLike
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
|
||||
|
||||
import numpy as np
|
||||
import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
try:
|
||||
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
|
||||
@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str:
|
||||
class PaddleOCRVLConfig:
|
||||
"""Configuration for PaddleOCR-VL algorithm."""
|
||||
|
||||
use_doc_orientation_classify: Optional[bool] = None
|
||||
use_doc_unwarping: Optional[bool] = None
|
||||
use_doc_orientation_classify: Optional[bool] = False
|
||||
use_doc_unwarping: Optional[bool] = False
|
||||
use_layout_detection: Optional[bool] = None
|
||||
use_polygon_points: Optional[bool] = None
|
||||
use_chart_recognition: Optional[bool] = None
|
||||
@ -79,7 +82,7 @@ class PaddleOCRVLConfig:
|
||||
min_pixels: Optional[int] = None
|
||||
max_pixels: Optional[int] = None
|
||||
max_new_tokens: Optional[int] = None
|
||||
merge_layout_blocks: Optional[bool] = None
|
||||
merge_layout_blocks: Optional[bool] = False
|
||||
markdown_ignore_labels: Optional[List[str]] = None
|
||||
vlm_extra_args: Optional[dict] = None
|
||||
|
||||
@ -116,14 +119,12 @@ class PaddleOCRConfig:
|
||||
if algorithm == "PaddleOCR-VL":
|
||||
# Create default PaddleOCRVLConfig object and convert to dict
|
||||
algorithm_config = asdict(PaddleOCRVLConfig())
|
||||
|
||||
# Apply user-provided VL config
|
||||
vl_config = cfg.get("vl")
|
||||
if isinstance(vl_config, dict):
|
||||
algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})
|
||||
algorithm_config_user = cfg.get("algorithm_config")
|
||||
if isinstance(algorithm_config_user, dict):
|
||||
algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None})
|
||||
|
||||
# Remove processed keys
|
||||
cfg.pop("vl", None)
|
||||
cfg.pop("algorithm_config", None)
|
||||
|
||||
# Prepare initialization arguments
|
||||
field_names = {field.name for field in fields(cls)}
|
||||
@ -146,6 +147,8 @@ class PaddleOCRConfig:
|
||||
class PaddleOCRParser(RAGFlowPdfParser):
|
||||
"""Parser for PDF documents using PaddleOCR API."""
|
||||
|
||||
_ZOOMIN = 2
|
||||
|
||||
_COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
|
||||
"prettify_markdown": "prettifyMarkdown",
|
||||
"show_formula_number": "showFormulaNumber",
|
||||
@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
request_timeout: int = 600,
|
||||
):
|
||||
"""Initialize PaddleOCR parser."""
|
||||
super().__init__()
|
||||
|
||||
self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
|
||||
self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
|
||||
self.algorithm = algorithm
|
||||
@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
# Force PDF file type
|
||||
self.file_type = 0
|
||||
|
||||
# Initialize page images for cropping
|
||||
self.page_images: list[Image.Image] = []
|
||||
self.page_from = 0
|
||||
|
||||
# Public methods
|
||||
def check_installation(self) -> tuple[bool, str]:
|
||||
"""Check if the parser is properly installed and configured."""
|
||||
@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
show_formula_number: Optional[bool] = None,
|
||||
visualize: Optional[bool] = None,
|
||||
additional_params: Optional[dict[str, Any]] = None,
|
||||
vl_config: Optional[dict[str, Any]] = None,
|
||||
algorithm_config: Optional[dict[str, Any]] = None,
|
||||
**kwargs: Any,
|
||||
) -> ParseResult:
|
||||
"""Parse PDF document using PaddleOCR API."""
|
||||
@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
config_dict["visualize"] = visualize
|
||||
if additional_params is not None:
|
||||
config_dict["additional_params"] = additional_params
|
||||
if vl_config is not None:
|
||||
config_dict["vl"] = vl_config
|
||||
|
||||
# Add any VL config parameters from kwargs
|
||||
for key, value in kwargs.items():
|
||||
if key in {field.name for field in fields(PaddleOCRVLConfig)}:
|
||||
config_dict[key] = value
|
||||
if algorithm_config is not None:
|
||||
config_dict["algorithm_config"] = algorithm_config
|
||||
|
||||
cfg = PaddleOCRConfig.from_dict(config_dict)
|
||||
|
||||
if not cfg.api_url:
|
||||
raise RuntimeError("[PaddleOCR] API URL missing")
|
||||
|
||||
# Prepare file data
|
||||
# Prepare file data and generate page images for cropping
|
||||
data_bytes = self._prepare_file_data(filepath, binary)
|
||||
|
||||
# Generate page images for cropping functionality
|
||||
input_source = filepath if binary is None else binary
|
||||
try:
|
||||
self.__images__(input_source, callback=callback)
|
||||
except Exception as e:
|
||||
self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")
|
||||
|
||||
# Build and send request
|
||||
result = self._send_request(data_bytes, cfg, callback)
|
||||
|
||||
@ -377,7 +388,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
label = block.get("block_label", "")
|
||||
block_bbox = block.get("block_bbox", [0, 0, 0, 0])
|
||||
|
||||
tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
|
||||
tag = f"@@{page_idx + 1}\t{block_bbox[0] // self._ZOOMIN}\t{block_bbox[2] // self._ZOOMIN}\t{block_bbox[1] // self._ZOOMIN}\t{block_bbox[3] // self._ZOOMIN}##"
|
||||
|
||||
if parse_method == "manual":
|
||||
sections.append((block_content, label, tag))
|
||||
@ -392,6 +403,149 @@ class PaddleOCRParser(RAGFlowPdfParser):
|
||||
"""Convert API response to table tuples."""
|
||||
return []
|
||||
|
||||
def __images__(self, fnm, page_from=0, page_to=100, callback=None):
|
||||
"""Generate page images from PDF for cropping."""
|
||||
self.page_from = page_from
|
||||
self.page_to = page_to
|
||||
try:
|
||||
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
|
||||
self.pdf = pdf
|
||||
self.page_images = [p.to_image(resolution=72, antialias=True).original for i, p in enumerate(self.pdf.pages[page_from:page_to])]
|
||||
except Exception as e:
|
||||
self.page_images = None
|
||||
self.logger.exception(e)
|
||||
|
||||
@staticmethod
|
||||
def extract_positions(txt: str):
|
||||
"""Extract position information from text tags."""
|
||||
poss = []
|
||||
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
|
||||
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
|
||||
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||
return poss
|
||||
|
||||
def crop(self, text: str, need_position: bool = False):
|
||||
"""Crop images from PDF based on position tags in text."""
|
||||
imgs = []
|
||||
poss = self.extract_positions(text)
|
||||
|
||||
if not poss:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
if not getattr(self, "page_images", None):
|
||||
self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
page_count = len(self.page_images)
|
||||
|
||||
filtered_poss = []
|
||||
for pns, left, right, top, bottom in poss:
|
||||
if not pns:
|
||||
self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
|
||||
continue
|
||||
valid_pns = [p for p in pns if 0 <= p < page_count]
|
||||
if not valid_pns:
|
||||
self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
|
||||
continue
|
||||
filtered_poss.append((valid_pns, left, right, top, bottom))
|
||||
|
||||
poss = filtered_poss
|
||||
if not poss:
|
||||
self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||||
GAP = 6
|
||||
pos = poss[0]
|
||||
first_page_idx = pos[0][0]
|
||||
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
pos = poss[-1]
|
||||
last_page_idx = pos[0][-1]
|
||||
if not (0 <= last_page_idx < page_count):
|
||||
self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
last_page_height = self.page_images[last_page_idx].size[1]
|
||||
poss.append(
|
||||
(
|
||||
[last_page_idx],
|
||||
pos[1],
|
||||
pos[2],
|
||||
min(last_page_height, pos[4] + GAP),
|
||||
min(last_page_height, pos[4] + 120),
|
||||
)
|
||||
)
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
right = left + max_width
|
||||
|
||||
if bottom <= top:
|
||||
bottom = top + 2
|
||||
|
||||
for pn in pns[1:]:
|
||||
if 0 <= pn - 1 < page_count:
|
||||
bottom += self.page_images[pn - 1].size[1]
|
||||
else:
|
||||
self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
|
||||
|
||||
if not (0 <= pns[0] < page_count):
|
||||
self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
|
||||
continue
|
||||
|
||||
img0 = self.page_images[pns[0]]
|
||||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
||||
crop0 = img0.crop((x0, y0, x1, y1))
|
||||
imgs.append(crop0)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
|
||||
|
||||
bottom -= img0.size[1]
|
||||
for pn in pns[1:]:
|
||||
if not (0 <= pn < page_count):
|
||||
self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
|
||||
continue
|
||||
page = self.page_images[pn]
|
||||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
||||
cimgp = page.crop((x0, y0, x1, y1))
|
||||
imgs.append(cimgp)
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pn + self.page_from, x0, x1, y0, y1))
|
||||
bottom -= page.size[1]
|
||||
|
||||
if not imgs:
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
height = 0
|
||||
for img in imgs:
|
||||
height += img.size[1] + GAP
|
||||
height = int(height)
|
||||
width = int(np.max([i.size[0] for i in imgs]))
|
||||
pic = Image.new("RGB", (width, height), (245, 245, 245))
|
||||
height = 0
|
||||
for ii, img in enumerate(imgs):
|
||||
if ii == 0 or ii + 1 == len(imgs):
|
||||
img = img.convert("RGBA")
|
||||
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
|
||||
overlay.putalpha(128)
|
||||
img = Image.alpha_composite(img, overlay).convert("RGB")
|
||||
pic.paste(img, (0, int(height)))
|
||||
height += img.size[1] + GAP
|
||||
|
||||
if need_position:
|
||||
return pic, positions
|
||||
return pic
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
Reference in New Issue
Block a user