# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import annotations import base64 import logging import os import re from dataclasses import asdict, dataclass, field, fields from io import BytesIO from os import PathLike from pathlib import Path from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List import numpy as np import pdfplumber import requests from PIL import Image try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser except Exception: class RAGFlowPdfParser: pass AlgorithmType = Literal["PaddleOCR-VL"] SectionTuple = tuple[str, ...] TableTuple = tuple[str, ...] 
# Return type of PaddleOCRParser.parse_pdf: (sections, tables).
ParseResult = tuple[list[SectionTuple], list[TableTuple]]

# Matches HTML image markup embedded in markdown: either an <img/> wrapped in
# a <div>, or a bare self-closing <img/>.
# NOTE(review): the angle-bracket tags had been stripped from this pattern,
# leaving only "]*>" fragments that matched stray "/>" text. The tags are
# reconstructed from that residue — confirm the wrapper element is <div>.
_MARKDOWN_IMAGE_PATTERN = re.compile(
    r"""
    <div[^>]*>\s*
    <img[^>]*/>\s*
    </div>
    |
    <img[^>]*/>
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)


def _remove_images_from_markdown(markdown: str) -> str:
    """Return *markdown* with every match of the image pattern removed."""
    return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)


@dataclass
class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm.

    Each field maps to a camelCase request parameter through
    ``PaddleOCRParser._ALGORITHM_FIELD_MAPPINGS``; fields left as ``None`` are
    omitted from the request payload.
    """

    use_doc_orientation_classify: Optional[bool] = False
    use_doc_unwarping: Optional[bool] = False
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
    use_seal_recognition: Optional[bool] = None
    use_ocr_for_image_block: Optional[bool] = None
    layout_threshold: Optional[Union[float, dict]] = None
    layout_nms: Optional[bool] = None
    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
    prompt_label: Optional[str] = None
    format_block_content: Optional[bool] = True
    repetition_penalty: Optional[float] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
    merge_layout_blocks: Optional[bool] = False
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None


@dataclass
class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser."""

    api_url: str = ""
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    # Extra top-level payload entries, merged in last by _build_payload.
    additional_params: dict[str, Any] = field(default_factory=dict)
    # Algorithm-specific options keyed by PaddleOCRVLConfig field name.
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Create configuration from dictionary.

        Args:
            config: Mapping of field names to values. Keys that are not fields
                of this dataclass are ignored. ``None`` or an empty mapping
                yields a default-constructed configuration.

        Returns:
            A configuration whose ``algorithm_config`` is the
            ``PaddleOCRVLConfig`` defaults overlaid with the non-``None``
            entries of the user-supplied ``algorithm_config``.

        Raises:
            ValueError: If ``config["algorithm"]`` is not a supported algorithm.
        """
        if not config:
            return cls()

        cfg = config.copy()
        algorithm = cfg.get("algorithm", "PaddleOCR-VL")
        if algorithm not in ("PaddleOCR-VL",):
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        # Start from the algorithm defaults, then overlay explicit user values.
        algorithm_config: dict[str, Any] = {}
        if algorithm == "PaddleOCR-VL":
            algorithm_config = asdict(PaddleOCRVLConfig())
            user_config = cfg.get("algorithm_config")
            if isinstance(user_config, dict):
                algorithm_config.update({k: v for k, v in user_config.items() if v is not None})

        # The merged dict built above replaces whatever the caller supplied.
        cfg.pop("algorithm_config", None)

        known_names = {f.name for f in fields(cls)}
        init_kwargs: dict[str, Any] = {name: cfg[name] for name in known_names if name in cfg}
        init_kwargs["algorithm_config"] = algorithm_config
        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Create configuration from keyword arguments."""
        return cls.from_dict(kwargs)


class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

    # Block coordinates from the API are floor-divided by this factor when
    # they are written into position tags.
    _ZOOMIN = 2

    # Config attribute name -> request payload key, for common options.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    # Algorithm name -> (algorithm_config key -> request payload key).
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser.

        Args:
            api_url: Endpoint URL; trailing slashes are removed. When empty,
                the ``PADDLEOCR_API_URL`` environment variable is used.
            access_token: API token; falls back to ``PADDLEOCR_ACCESS_TOKEN``.
            algorithm: Default algorithm name for requests.
            request_timeout: Default timeout passed to ``requests.post``.
        """
        super().__init__()
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)

        # Force PDF file type
        self.file_type = 0

        # Initialize page images for cropping
        self.page_images: list[Image.Image] = []
        self.page_from = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured.

        Returns:
            ``(True, "")`` when an API URL is set, otherwise ``(False, reason)``.
        """
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"

        # TODO [@Bobholamovic]: Check URL availability and token validity

        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        algorithm_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API.

        Args:
            filepath: Path of the PDF; read from disk only when ``binary`` is
                ``None``.
            binary: In-memory PDF content, used instead of ``filepath``.
            callback: Optional progress hook called as ``callback(progress,
                message)``; a progress of ``-1`` reports a failure.
            parse_method: Shape of each section tuple: ``"manual"`` gives
                ``(content, label, tag)``, ``"paper"`` gives
                ``(content + tag, label)``, anything else ``(content, tag)``.
            api_url, access_token, algorithm, request_timeout: Per-call
                overrides of the values given to the constructor.
            prettify_markdown, show_formula_number, visualize,
            additional_params, algorithm_config: Per-call overrides of the
                ``PaddleOCRConfig`` defaults; ``None`` keeps the default.
            **kwargs: Accepted and ignored.

        Returns:
            ``(sections, tables)``.

        Raises:
            RuntimeError: If no API URL is configured, or the request or
                response handling fails.
            FileNotFoundError: If ``binary`` is ``None`` and ``filepath`` does
                not exist.
            ValueError: If the algorithm is not supported.
        """
        config_dict: dict[str, Any] = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        # Only forward the optional settings the caller actually supplied so
        # that the PaddleOCRConfig defaults apply to the rest.
        optional_overrides = {
            "prettify_markdown": prettify_markdown,
            "show_formula_number": show_formula_number,
            "visualize": visualize,
            "additional_params": additional_params,
            "algorithm_config": algorithm_config,
        }
        config_dict.update({key: value for key, value in optional_overrides.items() if value is not None})

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data and generate page images for cropping
        data_bytes = self._prepare_file_data(filepath, binary)

        # Page images are only needed by crop(); failing to render them must
        # not abort the parse itself.
        input_source = filepath if binary is None else binary
        try:
            self.__images__(input_source, callback=callback)
        except Exception as e:
            self.logger.warning("[PaddleOCR] Failed to generate page images for cropping: %s", e)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request.

        Returns the content of ``binary`` when it is given, otherwise the bytes
        read from ``filepath``.

        Raises:
            FileNotFoundError: If ``binary`` is ``None`` and ``filepath`` does
                not exist.
        """
        if binary is not None:
            if isinstance(binary, (bytes, bytearray)):
                return bytes(binary)
            return binary.getvalue()

        # The path is only resolved when it is actually the data source, so a
        # placeholder filepath is harmless when a binary payload is supplied.
        source_path = Path(filepath)
        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")
        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request.

        The payload carries the base64-encoded file, the file type, the common
        options, the mapped algorithm options that are not ``None``, and
        finally ``config.additional_params``, which may override earlier keys.
        """
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for attr_name, api_param in self._COMMON_FIELD_MAPPING.items():
            value = getattr(config, attr_name)
            if value is not None:
                payload[api_param] = value

        # Add algorithm-specific parameters; unmapped keys are dropped.
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for param_key, param_value in config.algorithm_config.items():
            if param_value is not None and param_key in algorithm_mapping:
                payload[algorithm_mapping[param_key]] = param_value

        # Add any additional parameters
        if config.additional_params:
            payload.update(config.additional_params)

        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response.

        Returns:
            The ``"result"`` object of the JSON response.

        Raises:
            RuntimeError: If the HTTP request fails, the body is not JSON, or
                the response does not have ``errorCode == 0`` and a dict
                ``result``.
        """
        payload = self._build_payload(data, self.file_type, config)

        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        try:
            # Use the timeout resolved into the per-call config so that the
            # request_timeout override of parse_pdf takes effect.
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc

        try:
            response_data = resp.json()
        except Exception as exc:
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc

        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # A JSON body that is not an object is reported like any other
        # malformed response instead of failing on `.get`.
        is_valid = isinstance(response_data, dict) and response_data.get("errorCode") == 0 and isinstance(response_data.get("result"), dict)
        if not is_valid:
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")

        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples.

        Every non-empty block of every page becomes one tuple whose shape
        depends on ``parse_method`` (see ``parse_pdf``). The position tag has
        the form ``@@<page>\\t<x0>\\t<x1>\\t<y0>\\t<y1>##`` with a 1-based page
        number and coordinates floor-divided by ``_ZOOMIN``.
        """
        sections: list[SectionTuple] = []

        if algorithm == "PaddleOCR-VL":
            layout_parsing_results = result.get("layoutParsingResults", [])

            for page_idx, layout_result in enumerate(layout_parsing_results):
                pruned_result = layout_result.get("prunedResult", {})
                parsing_res_list = pruned_result.get("parsing_res_list", [])

                for block in parsing_res_list:
                    # `or` also covers keys that are present but null.
                    block_content = (block.get("block_content") or "").strip()
                    if not block_content:
                        continue

                    # Remove images
                    block_content = _remove_images_from_markdown(block_content)

                    label = block.get("block_label", "")
                    block_bbox = block.get("block_bbox") or [0, 0, 0, 0]

                    # The bbox is indexed as [x0, y0, x1, y1]; the tag stores
                    # both x values first, then both y values.
                    x0, y0, x1, y1 = (block_bbox[i] // self._ZOOMIN for i in range(4))
                    tag = f"@@{page_idx + 1}\t{x0}\t{x1}\t{y0}\t{y1}##"

                    if parse_method == "manual":
                        sections.append((block_content, label, tag))
                    elif parse_method == "paper":
                        sections.append((block_content + tag, label))
                    else:
                        sections.append((block_content, tag))

        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples.

        Currently always returns an empty list; ``result`` is not inspected.
        """
        return []

    def __images__(self, fnm, page_from=0, page_to=100, callback=None):
        """Generate page images from PDF for cropping.

        Args:
            fnm: A filesystem path, raw PDF bytes, or a ``BytesIO`` holding
                the PDF.
            page_from: Index of the first page to render.
            page_to: Index one past the last page to render.
            callback: Accepted for signature compatibility; unused here.

        On failure the exception is logged and ``page_images`` is set to
        ``None``, which ``crop`` treats as "no images available".
        """
        self.page_from = page_from
        self.page_to = page_to
        try:
            if isinstance(fnm, (str, PathLike)):
                source = fnm
            elif isinstance(fnm, BytesIO):
                # parse_pdf may hand over a BytesIO; read from a private copy
                # so the caller's stream position is left untouched.
                source = BytesIO(fnm.getvalue())
            else:
                source = BytesIO(fnm)

            with pdfplumber.open(source) as pdf:
                self.pdf = pdf
                self.page_images = [p.to_image(resolution=72, antialias=True).original for p in self.pdf.pages[page_from:page_to]]
        except Exception as e:
            self.page_images = None
            self.logger.exception(e)

    @staticmethod
    def extract_positions(txt: str):
        """Extract position information from text tags.

        Returns:
            A list of ``(page_indices, left, right, top, bottom)`` tuples, one
            per ``@@...##`` tag found in ``txt``. Page numbers in the tag are
            1-based and may be a ``-``-separated list; the returned indices
            are 0-based.
        """
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
            left, right, top, bottom = float(left), float(right), float(top), float(bottom)
            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
        return poss

    def crop(self, text: str, need_position: bool = False):
        """Crop images from PDF based on position tags in text.

        The regions named by the tags are cropped from ``page_images`` and
        stacked vertically, preceded and followed by a context strip taken
        above the first and below the last region.

        Args:
            text: Text containing ``@@...##`` position tags.
            need_position: When true, also return the crop positions.

        Returns:
            The stitched image, or ``(image, positions)`` when
            ``need_position`` is true. If nothing can be cropped the result is
            ``None``, or ``(None, None)`` when ``need_position`` is true.
        """
        # Value returned on every "nothing to crop" path.
        nothing = (None, None) if need_position else None

        imgs = []
        poss = self.extract_positions(text)
        if not poss:
            return nothing

        if not getattr(self, "page_images", None):
            self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
            return nothing

        page_count = len(self.page_images)

        # Drop page indices that fall outside the rendered pages.
        filtered_poss = []
        for pns, left, right, top, bottom in poss:
            if not pns:
                self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
                continue
            valid_pns = [p for p in pns if 0 <= p < page_count]
            if not valid_pns:
                self.logger.warning("[PaddleOCR] All page indices %s out of range for %s pages; skipping.", pns, page_count)
                continue
            filtered_poss.append((valid_pns, left, right, top, bottom))

        poss = filtered_poss
        if not poss:
            self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
            return nothing

        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6

        # Leading context strip: up to 120 units above the first region.
        pos = poss[0]
        first_page_idx = pos[0][0]
        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))

        # Trailing context strip: up to 120 units below the last region.
        pos = poss[-1]
        last_page_idx = pos[0][-1]
        if not (0 <= last_page_idx < page_count):
            self.logger.warning("[PaddleOCR] Last page index %s out of range for %s pages; skipping crop.", last_page_idx, page_count)
            return nothing
        last_page_height = self.page_images[last_page_idx].size[1]
        poss.append(
            (
                [last_page_idx],
                pos[1],
                pos[2],
                min(last_page_height, pos[4] + GAP),
                min(last_page_height, pos[4] + 120),
            )
        )

        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            right = left + max_width

            if bottom <= top:
                bottom = top + 2

            # Extend `bottom` by page heights for regions that continue onto
            # further pages.
            for pn in pns[1:]:
                if 0 <= pn - 1 < page_count:
                    bottom += self.page_images[pn - 1].size[1]
                else:
                    self.logger.warning("[PaddleOCR] Page index %s-1 out of range for %s pages during crop; skipping height accumulation.", pn, page_count)

            if not (0 <= pns[0] < page_count):
                self.logger.warning("[PaddleOCR] Base page index %s out of range for %s pages during crop; skipping this segment.", pns[0], page_count)
                continue

            img0 = self.page_images[pns[0]]
            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
            crop0 = img0.crop((x0, y0, x1, y1))
            imgs.append(crop0)
            # The first and last entries are the context strips; only the real
            # regions are reported as positions.
            if 0 < ii < len(poss) - 1:
                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

            bottom -= img0.size[1]
            for pn in pns[1:]:
                if not (0 <= pn < page_count):
                    self.logger.warning("[PaddleOCR] Page index %s out of range for %s pages during crop; skipping this page.", pn, page_count)
                    continue
                page = self.page_images[pn]
                x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
                cimgp = page.crop((x0, y0, x1, y1))
                imgs.append(cimgp)
                if 0 < ii < len(poss) - 1:
                    positions.append((pn + self.page_from, x0, x1, y0, y1))
                bottom -= page.size[1]

        if not imgs:
            return nothing

        height = 0
        for img in imgs:
            height += img.size[1] + GAP
        height = int(height)
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB", (width, height), (245, 245, 245))

        height = 0
        for ii, img in enumerate(imgs):
            # The first and last pasted images get a half-transparent overlay.
            if ii == 0 or ii + 1 == len(imgs):
                img = img.convert("RGBA")
                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(height)))
            height += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"))
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)