# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # from __future__ import annotations import base64 import logging import os import re from dataclasses import asdict, dataclass, field, fields from io import BytesIO from os import PathLike from pathlib import Path from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List import requests try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser except Exception: class RAGFlowPdfParser: pass AlgorithmType = Literal["PaddleOCR-VL"] SectionTuple = tuple[str, ...] TableTuple = tuple[str, ...] 
# Return type of ``PaddleOCRParser.parse_pdf``: (sections, tables).
ParseResult = tuple[list[SectionTuple], list[TableTuple]]

# Matches HTML image markup embedded in markdown text: either an ``<img .../>``
# wrapped in a ``<div>...</div>``, or a bare self-closing ``<img .../>``.
# NOTE(review): assumes the API emits images in exactly these two shapes --
# confirm against real responses.
_MARKDOWN_IMAGE_PATTERN = re.compile(
    r"""
    <div[^>]*>\s*
    <img[^>]*/>\s*
    </div>
    |
    <img[^>]*/>
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)


def _remove_images_from_markdown(markdown: str) -> str:
    """Return *markdown* with every match of the image pattern removed."""
    return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)


@dataclass
class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm.

    Every field defaults to ``None`` except ``format_block_content``; fields
    left as ``None`` are omitted from the request payload by
    ``PaddleOCRParser._build_payload``.
    """

    use_doc_orientation_classify: Optional[bool] = None
    use_doc_unwarping: Optional[bool] = None
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
    use_seal_recognition: Optional[bool] = None
    use_ocr_for_image_block: Optional[bool] = None
    layout_threshold: Optional[Union[float, dict]] = None
    layout_nms: Optional[bool] = None
    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
    prompt_label: Optional[str] = None
    format_block_content: Optional[bool] = True
    repetition_penalty: Optional[float] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
    merge_layout_blocks: Optional[bool] = None
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None


@dataclass
class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser."""

    api_url: str = ""
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Create configuration from dictionary.

        Args:
            config: Mapping of option names to values. Keys named after
                ``PaddleOCRConfig`` fields are passed to the constructor.
                Algorithm options may be given either nested under ``"vl"``
                or as flat top-level keys named after ``PaddleOCRVLConfig``
                fields; on conflict the nested ``"vl"`` value wins. Options
                whose value is ``None`` are ignored.

        Returns:
            A new ``PaddleOCRConfig``. For an empty or ``None`` *config* this
            is the plain default instance.

        Raises:
            ValueError: If ``config["algorithm"]`` is not supported.
        """
        if not config:
            return cls()

        # Work on a shallow copy so the caller's mapping is never modified.
        cfg = config.copy()

        algorithm = cfg.get("algorithm", "PaddleOCR-VL")
        if algorithm not in ("PaddleOCR-VL",):
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        nested_vl = cfg.pop("vl", None)

        algorithm_config: dict[str, Any] = {}
        if algorithm == "PaddleOCR-VL":
            # Start from the dataclass defaults ...
            algorithm_config = asdict(PaddleOCRVLConfig())
            # ... then apply flat top-level options (this is how keyword
            # arguments forwarded by ``parse_pdf`` arrive) ...
            for vl_field in fields(PaddleOCRVLConfig):
                flat_value = cfg.pop(vl_field.name, None)
                if flat_value is not None:
                    algorithm_config[vl_field.name] = flat_value
            # ... and finally the explicit nested mapping, which takes
            # precedence over flat keys.
            if isinstance(nested_vl, dict):
                algorithm_config.update({k: v for k, v in nested_vl.items() if v is not None})

        # Keep only keys that are real constructor fields; a caller-supplied
        # "algorithm_config" is replaced by the one computed above.
        init_kwargs: dict[str, Any] = {f.name: cfg[f.name] for f in fields(cls) if f.name in cfg}
        init_kwargs["algorithm_config"] = algorithm_config

        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Create configuration from keyword arguments."""
        return cls.from_dict(kwargs)


class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

    # snake_case configuration field -> camelCase request parameter.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    # Per-algorithm mapping of configuration field -> request parameter.
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser.

        Args:
            api_url: Endpoint URL; falls back to ``PADDLEOCR_API_URL``.
            access_token: API token; falls back to ``PADDLEOCR_ACCESS_TOKEN``.
            algorithm: Algorithm identifier used by default for requests.
            request_timeout: Default HTTP timeout passed to ``requests.post``.
        """
        # Trailing slashes are stripped whichever source the URL comes from.
        resolved_url = api_url or os.getenv("PADDLEOCR_API_URL", "")
        self.api_url = resolved_url.rstrip("/")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)

        # Force PDF file type
        self.file_type = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured.

        Returns:
            ``(True, "")`` when an API URL is set, otherwise ``(False, reason)``.
        """
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"

        # TODO [@Bobholamovic]: Check URL availability and token validity

        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        vl_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API.

        Args:
            filepath: Path of the document; only read when *binary* is ``None``.
            binary: Document content, as bytes or a ``BytesIO``.
            callback: Optional progress hook called as ``callback(progress, message)``;
                a progress of ``-1`` reports a failure.
            parse_method: Selects the section tuple layout, see
                ``_transfer_to_sections``.
            api_url, access_token, algorithm, request_timeout: Per-call
                overrides of the values given to the constructor.
            prettify_markdown, show_formula_number, visualize: Common request
                options; ``None`` keeps the ``PaddleOCRConfig`` default.
            additional_params: Extra entries merged verbatim into the payload.
            vl_config: Algorithm options as a mapping.
            **kwargs: Keyword arguments named after ``PaddleOCRVLConfig``
                fields are used as algorithm options (entries in *vl_config*
                win on conflict); any other keyword is ignored.

        Returns:
            ``(sections, tables)``.

        Raises:
            RuntimeError: If no API URL is configured, or the request fails.
            FileNotFoundError: If *binary* is ``None`` and *filepath* is missing.
        """
        config_dict: dict[str, Any] = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        optional_overrides = {
            "prettify_markdown": prettify_markdown,
            "show_formula_number": show_formula_number,
            "visualize": visualize,
            "additional_params": additional_params,
            "vl": vl_config,
        }
        config_dict.update({key: value for key, value in optional_overrides.items() if value is not None})

        # Forward recognised algorithm options given as plain keyword
        # arguments; ``PaddleOCRConfig.from_dict`` folds these flat keys into
        # the algorithm configuration.
        vl_field_names = {vl_field.name for vl_field in fields(PaddleOCRVLConfig)}
        config_dict.update({key: value for key, value in kwargs.items() if key in vl_field_names})

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data
        data_bytes = self._prepare_file_data(filepath, binary)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request.

        In-memory content takes priority; *filepath* is only touched when
        *binary* is ``None``.

        Raises:
            FileNotFoundError: If the file has to be read but does not exist.
        """
        if binary is not None:
            if isinstance(binary, (bytes, bytearray)):
                # Normalise bytearray to the declared return type.
                return bytes(binary)
            return binary.getvalue()

        source_path = Path(filepath)
        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")

        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request.

        The file is sent base64-encoded under ``"file"``. Options set to
        ``None`` and algorithm options without a known request-parameter name
        are left out. ``config.additional_params`` is merged last and can
        therefore override any other entry.
        """
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for field_name, api_param in self._COMMON_FIELD_MAPPING.items():
            param_value = getattr(config, field_name)
            if param_value is not None:
                payload[api_param] = param_value

        # Add algorithm-specific parameters
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for param_key, param_value in config.algorithm_config.items():
            if param_value is not None and param_key in algorithm_mapping:
                payload[algorithm_mapping[param_key]] = param_value

        # Add any additional parameters
        if config.additional_params:
            payload.update(config.additional_params)

        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response.

        Returns:
            The ``"result"`` object of the JSON response.

        Raises:
            RuntimeError: If the HTTP request fails, the body is not JSON, or
                the body is not an object with ``errorCode == 0`` and a dict
                ``result``.
        """
        payload = self._build_payload(data, self.file_type, config)

        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        # Send request. The timeout comes from the per-call configuration so
        # that a ``request_timeout`` override given to ``parse_pdf`` is used.
        try:
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc

        # Parse response
        try:
            response_data = resp.json()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] response is not JSON: {exc}")
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc

        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # Validate response format; a JSON body that is not an object (for
        # example a list) is rejected here instead of failing on ``.get``.
        is_valid = (
            isinstance(response_data, dict)
            and response_data.get("errorCode") == 0
            and isinstance(response_data.get("result"), dict)
        )
        if not is_valid:
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")

        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples.

        Reads ``result["layoutParsingResults"][i]["prunedResult"]["parsing_res_list"]``
        and emits one tuple per block that still has text after image markup
        has been removed. Each block gets a position tag of the form
        ``@@<page>\\t<bbox[0]>\\t<bbox[2]>\\t<bbox[1]>\\t<bbox[3]>##`` with a
        1-based page number.

        Tuple layout by *parse_method*:
            ``"manual"``: ``(content, label, tag)``
            ``"paper"``:  ``(content + tag, label)``
            otherwise:    ``(content, tag)``
        """
        sections: list[SectionTuple] = []
        if algorithm != "PaddleOCR-VL":
            return sections

        for page_idx, layout_result in enumerate(result.get("layoutParsingResults") or []):
            pruned_result = layout_result.get("prunedResult") or {}
            for block in pruned_result.get("parsing_res_list") or []:
                # Strip image markup first so that blocks consisting only of
                # an image are skipped instead of yielding empty sections.
                raw_content = block.get("block_content") or ""
                block_content = _remove_images_from_markdown(raw_content).strip()
                if not block_content:
                    continue

                label = block.get("block_label", "")
                block_bbox = block.get("block_bbox") or [0, 0, 0, 0]
                if len(block_bbox) < 4:
                    # Incomplete box: fall back to the same default as a missing one.
                    block_bbox = [0, 0, 0, 0]

                tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"

                if parse_method == "manual":
                    sections.append((block_content, label, tag))
                elif parse_method == "paper":
                    sections.append((block_content + tag, label))
                else:
                    sections.append((block_content, tag))

        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples.

        Currently always returns an empty list; *result* is not inspected.
        """
        return []


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"))
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)