feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-30 15:16:45 +08:00 · 2026-01-09 17:48:45 +08:00
parent 6abf55c048
commit 2e09db02f3
34 changed files with 1510 additions and 453 deletions
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@@ -0,0 +1,400 @@
+#  Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from __future__ import annotations
+
+import base64
+import logging
+import os
+import re
+from dataclasses import asdict, dataclass, field, fields
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
+
+import requests
+
+# Import the real base parser when available; otherwise fall back to an empty
+# stand-in class so that this module can still be imported.
+# NOTE(review): the broad ``except Exception`` also hides non-import failures
+# raised while loading deepdoc -- presumably intentional, confirm.
+try:
+    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+except Exception:
+
+    class RAGFlowPdfParser:
+        pass
+
+
+# Identifiers of the algorithms this parser accepts (currently only one).
+AlgorithmType = Literal["PaddleOCR-VL"]
+# One parsed block; the tuple layout depends on ``parse_method``
+# (see ``PaddleOCRParser._transfer_to_sections``).
+SectionTuple = tuple[str, ...]
+# One parsed table (``PaddleOCRParser._transfer_to_tables`` currently returns none).
+TableTuple = tuple[str, ...]
+# ``(sections, tables)`` pair returned by ``PaddleOCRParser.parse_pdf``.
+ParseResult = tuple[list[SectionTuple], list[TableTuple]]
+
+
+# Matches an HTML ``<img>`` tag, optionally wrapped in a ``<div>`` that holds
+# nothing but that image. Both ``<img ...>`` and the self-closing
+# ``<img ... />`` spelling are accepted, because ``[^>]*`` also consumes the
+# optional trailing slash.
+_MARKDOWN_IMAGE_PATTERN = re.compile(
+    r"""
+        <div[^>]*>\s*
+        <img\b[^>]*>\s*
+        </div>
+        |
+        <img\b[^>]*>
+        """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+def _remove_images_from_markdown(markdown: str) -> str:
+    """Return *markdown* with embedded HTML image tags removed.
+
+    An ``<img>`` tag is dropped together with a ``<div>`` wrapper when the
+    wrapper contains only that image; all other text is returned unchanged.
+    """
+    return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)
+
+
+@dataclass
+class PaddleOCRVLConfig:
+    """Configuration for PaddleOCR-VL algorithm.
+
+    ``PaddleOCRConfig.from_dict`` seeds its ``algorithm_config`` from
+    ``asdict()`` of this class. When the request payload is built, options
+    whose value is ``None`` are skipped and the remaining names are translated
+    through ``PaddleOCRParser._ALGORITHM_FIELD_MAPPINGS``.
+    """
+
+    use_doc_orientation_classify: Optional[bool] = None
+    use_doc_unwarping: Optional[bool] = None
+    use_layout_detection: Optional[bool] = None
+    use_polygon_points: Optional[bool] = None
+    use_chart_recognition: Optional[bool] = None
+    use_seal_recognition: Optional[bool] = None
+    use_ocr_for_image_block: Optional[bool] = None
+    layout_threshold: Optional[Union[float, dict]] = None
+    layout_nms: Optional[bool] = None
+    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
+    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
+    prompt_label: Optional[str] = None
+    # The only field with a non-None default.
+    format_block_content: Optional[bool] = True
+    repetition_penalty: Optional[float] = None
+    temperature: Optional[float] = None
+    top_p: Optional[float] = None
+    min_pixels: Optional[int] = None
+    max_pixels: Optional[int] = None
+    max_new_tokens: Optional[int] = None
+    merge_layout_blocks: Optional[bool] = None
+    markdown_ignore_labels: Optional[List[str]] = None
+    vlm_extra_args: Optional[dict] = None
+
+
+@dataclass
+class PaddleOCRConfig:
+    """Settings for one PaddleOCR API call.
+
+    ``algorithm_config`` holds the algorithm-specific options keyed by their
+    snake_case names; ``additional_params`` holds extra entries that are merged
+    into the request payload as-is.
+    """
+
+    api_url: str = ""
+    access_token: Optional[str] = None
+    algorithm: AlgorithmType = "PaddleOCR-VL"
+    request_timeout: int = 600
+    prettify_markdown: bool = True
+    show_formula_number: bool = True
+    visualize: bool = False
+    additional_params: dict[str, Any] = field(default_factory=dict)
+    algorithm_config: dict[str, Any] = field(default_factory=dict)
+
+    @classmethod
+    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
+        """Build a configuration from a plain dictionary.
+
+        Keys that match a dataclass field are passed through; the optional
+        ``"vl"`` entry supplies PaddleOCR-VL overrides, of which ``None``
+        values are ignored. Any ``"algorithm_config"`` entry in *config* is
+        replaced by the computed one. An empty or missing *config* yields the
+        plain defaults.
+
+        Raises:
+            ValueError: If ``config["algorithm"]`` is not a supported algorithm.
+        """
+        if not config:
+            return cls()
+
+        # Work on a shallow copy so the caller's dictionary is left untouched.
+        cfg = dict(config)
+
+        algorithm = cfg.get("algorithm", "PaddleOCR-VL")
+        if algorithm not in ("PaddleOCR-VL",):
+            raise ValueError(f"Unsupported algorithm: {algorithm}")
+
+        # "PaddleOCR-VL" is the only algorithm that passes the check above, so
+        # start from its defaults and layer the user-provided overrides on top.
+        algorithm_config: dict[str, Any] = asdict(PaddleOCRVLConfig())
+        vl_overrides = cfg.pop("vl", None)
+        if isinstance(vl_overrides, dict):
+            for option, value in vl_overrides.items():
+                if value is not None:
+                    algorithm_config[option] = value
+
+        # Forward only the keys that correspond to dataclass fields.
+        init_kwargs: dict[str, Any] = {spec.name: cfg[spec.name] for spec in fields(cls) if spec.name in cfg}
+        init_kwargs["algorithm_config"] = algorithm_config
+
+        return cls(**init_kwargs)
+
+    @classmethod
+    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
+        """Build a configuration from keyword arguments (see ``from_dict``)."""
+        return cls.from_dict(kwargs)
+
+
+class PaddleOCRParser(RAGFlowPdfParser):
+    """Parser for PDF documents using a remote PaddleOCR HTTP API.
+
+    The document is base64-encoded and POSTed as JSON to the configured
+    endpoint; the ``layoutParsingResults`` of the response are converted into
+    section tuples. Table extraction is not implemented and yields ``[]``.
+    """
+
+    # Maps snake_case option names to the camelCase keys sent in the payload.
+    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
+        "prettify_markdown": "prettifyMarkdown",
+        "show_formula_number": "showFormulaNumber",
+        "visualize": "visualize",
+    }
+
+    # Same mapping for the algorithm-specific options, keyed by algorithm name.
+    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
+        "PaddleOCR-VL": {
+            "use_doc_orientation_classify": "useDocOrientationClassify",
+            "use_doc_unwarping": "useDocUnwarping",
+            "use_layout_detection": "useLayoutDetection",
+            "use_polygon_points": "usePolygonPoints",
+            "use_chart_recognition": "useChartRecognition",
+            "use_seal_recognition": "useSealRecognition",
+            "use_ocr_for_image_block": "useOcrForImageBlock",
+            "layout_threshold": "layoutThreshold",
+            "layout_nms": "layoutNms",
+            "layout_unclip_ratio": "layoutUnclipRatio",
+            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
+            "prompt_label": "promptLabel",
+            "format_block_content": "formatBlockContent",
+            "repetition_penalty": "repetitionPenalty",
+            "temperature": "temperature",
+            "top_p": "topP",
+            "min_pixels": "minPixels",
+            "max_pixels": "maxPixels",
+            "max_new_tokens": "maxNewTokens",
+            "merge_layout_blocks": "mergeLayoutBlocks",
+            "markdown_ignore_labels": "markdownIgnoreLabels",
+            "vlm_extra_args": "vlmExtraArgs",
+        },
+    }
+
+    def __init__(
+        self,
+        api_url: Optional[str] = None,
+        access_token: Optional[str] = None,
+        algorithm: AlgorithmType = "PaddleOCR-VL",
+        *,
+        request_timeout: int = 600,
+    ):
+        """Initialize PaddleOCR parser.
+
+        Args:
+            api_url: Endpoint URL. When empty, the ``PADDLEOCR_API_URL``
+                environment variable is used. Trailing slashes are removed.
+            access_token: API token. When empty, the
+                ``PADDLEOCR_ACCESS_TOKEN`` environment variable is used.
+            algorithm: Default algorithm for ``parse_pdf``.
+            request_timeout: Default HTTP timeout handed to ``requests.post``.
+        """
+        # NOTE(review): the base-class initializer is not invoked here --
+        # confirm that this is intentional.
+        # Normalise the URL regardless of whether it came from the argument or
+        # from the environment.
+        self.api_url = (api_url or os.getenv("PADDLEOCR_API_URL", "")).rstrip("/")
+        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
+        self.algorithm = algorithm
+        self.request_timeout = request_timeout
+        self.logger = logging.getLogger(self.__class__.__name__)
+
+        # Force PDF file type
+        self.file_type = 0
+
+    # Public methods
+    def check_installation(self) -> tuple[bool, str]:
+        """Check if the parser is properly configured.
+
+        Returns:
+            ``(True, "")`` when an API URL is set, otherwise ``(False, reason)``.
+            The endpoint itself is not contacted.
+        """
+        if not self.api_url:
+            return False, "[PaddleOCR] API URL not configured"
+
+        # TODO [@Bobholamovic]: Check URL availability and token validity
+
+        return True, ""
+
+    def parse_pdf(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes | None = None,
+        callback: Optional[Callable[[float, str], None]] = None,
+        *,
+        parse_method: str = "raw",
+        api_url: Optional[str] = None,
+        access_token: Optional[str] = None,
+        algorithm: Optional[AlgorithmType] = None,
+        request_timeout: Optional[int] = None,
+        prettify_markdown: Optional[bool] = None,
+        show_formula_number: Optional[bool] = None,
+        visualize: Optional[bool] = None,
+        additional_params: Optional[dict[str, Any]] = None,
+        vl_config: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> ParseResult:
+        """Parse PDF document using PaddleOCR API.
+
+        Args:
+            filepath: Path of the PDF; only read when *binary* is ``None``.
+            binary: In-memory PDF content that takes precedence over *filepath*.
+            callback: Optional ``callback(progress, message)`` progress hook;
+                ``-1`` is reported as progress on failure.
+            parse_method: Selects the section tuple layout
+                (see ``_transfer_to_sections``).
+            api_url, access_token, algorithm, request_timeout: Per-call
+                overrides of the values given to the constructor.
+            prettify_markdown, show_formula_number, visualize: Common API
+                options; ``None`` keeps the ``PaddleOCRConfig`` default.
+            additional_params: Extra entries merged into the request payload.
+            vl_config: PaddleOCR-VL options keyed by ``PaddleOCRVLConfig``
+                field name.
+            **kwargs: Keyword arguments named after a ``PaddleOCRVLConfig``
+                field are treated as VL options and override *vl_config*;
+                ``None`` values and unrelated keywords are ignored.
+
+        Returns:
+            ``(sections, tables)``; ``tables`` is currently always empty.
+
+        Raises:
+            RuntimeError: If no API URL is configured or the request fails.
+            FileNotFoundError: If *binary* is ``None`` and *filepath* is missing.
+            ValueError: If the algorithm is not supported.
+        """
+        config_dict: dict[str, Any] = {
+            "api_url": api_url if api_url is not None else self.api_url,
+            "access_token": access_token if access_token is not None else self.access_token,
+            "algorithm": algorithm if algorithm is not None else self.algorithm,
+            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
+        }
+
+        # Only forward the options the caller actually set, so that the
+        # PaddleOCRConfig defaults apply to everything else.
+        optional_overrides = {
+            "prettify_markdown": prettify_markdown,
+            "show_formula_number": show_formula_number,
+            "visualize": visualize,
+            "additional_params": additional_params,
+        }
+        config_dict.update({key: value for key, value in optional_overrides.items() if value is not None})
+
+        # PaddleOCRConfig.from_dict reads VL options exclusively from the "vl"
+        # entry, so VL options passed as loose keyword arguments must be folded
+        # into it. The caller's vl_config dict is copied, never mutated.
+        vl_field_names = {spec.name for spec in fields(PaddleOCRVLConfig)}
+        vl_options: dict[str, Any] = dict(vl_config) if isinstance(vl_config, dict) else {}
+        vl_options.update({key: value for key, value in kwargs.items() if key in vl_field_names and value is not None})
+        if vl_options:
+            config_dict["vl"] = vl_options
+
+        cfg = PaddleOCRConfig.from_dict(config_dict)
+
+        if not cfg.api_url:
+            raise RuntimeError("[PaddleOCR] API URL missing")
+
+        # Prepare file data
+        data_bytes = self._prepare_file_data(filepath, binary)
+
+        # Build and send request
+        result = self._send_request(data_bytes, cfg, callback)
+
+        # Process response
+        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
+        if callback:
+            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")
+
+        tables = self._transfer_to_tables(result)
+        if callback:
+            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")
+
+        return sections, tables
+
+    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
+        """Return the raw document bytes to upload.
+
+        *binary* wins when given; *filepath* is only touched otherwise, so a
+        placeholder path is fine for in-memory documents.
+
+        Raises:
+            FileNotFoundError: If *binary* is ``None`` and *filepath* does not exist.
+        """
+        if binary is not None:
+            if isinstance(binary, (bytes, bytearray)):
+                # bytes() keeps the declared return type for bytearray input.
+                return bytes(binary)
+            return binary.getvalue()
+
+        source_path = Path(filepath)
+        if not source_path.exists():
+            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")
+
+        return source_path.read_bytes()
+
+    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
+        """Build the JSON payload for the API request.
+
+        The file is base64-encoded; options are renamed to their camelCase API
+        keys and ``None`` values are left out. ``config.additional_params`` is
+        merged last and can therefore override any earlier key.
+        """
+        payload: dict[str, Any] = {
+            "file": base64.b64encode(data).decode("ascii"),
+            "fileType": file_type,
+        }
+
+        # Add common parameters
+        for option, api_param in self._COMMON_FIELD_MAPPING.items():
+            value = getattr(config, option)
+            if value is not None:
+                payload[api_param] = value
+
+        # Add algorithm-specific parameters; options without a mapping are skipped.
+        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
+        for option, value in config.algorithm_config.items():
+            if value is not None and option in algorithm_mapping:
+                payload[algorithm_mapping[option]] = value
+
+        # Add any additional parameters
+        if config.additional_params:
+            payload.update(config.additional_params)
+
+        return payload
+
+    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
+        """Send the request to the PaddleOCR API and return its ``result`` object.
+
+        Raises:
+            RuntimeError: If the HTTP request fails, the body is not JSON, or
+                the response does not carry ``errorCode == 0`` together with a
+                ``result`` object. The original exception is chained as the
+                cause where there is one.
+        """
+        payload = self._build_payload(data, self.file_type, config)
+
+        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
+        if config.access_token:
+            headers["Authorization"] = f"token {config.access_token}"
+
+        self.logger.info("[PaddleOCR] invoking API")
+        if callback:
+            callback(0.1, "[PaddleOCR] submitting request")
+
+        # Send request. The timeout comes from the per-call configuration so
+        # that a ``request_timeout`` override given to parse_pdf takes effect.
+        try:
+            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
+            resp.raise_for_status()
+        except Exception as exc:
+            if callback:
+                callback(-1, f"[PaddleOCR] request failed: {exc}")
+            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc
+
+        # Parse response
+        try:
+            response_data = resp.json()
+        except Exception as exc:
+            if callback:
+                callback(-1, f"[PaddleOCR] response is not JSON: {exc}")
+            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc
+
+        if callback:
+            callback(0.8, "[PaddleOCR] response received")
+
+        # Validate response format; a JSON body that is not an object (e.g. a
+        # list) is rejected here instead of failing on ``.get``.
+        if (
+            not isinstance(response_data, dict)
+            or response_data.get("errorCode") != 0
+            or not isinstance(response_data.get("result"), dict)
+        ):
+            if callback:
+                callback(-1, "[PaddleOCR] invalid response format")
+            raise RuntimeError("[PaddleOCR] invalid response format")
+
+        return response_data["result"]
+
+    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
+        """Convert the API ``result`` object to section tuples.
+
+        Tuple layout per *parse_method*:
+            ``"manual"``: ``(content, label, tag)``
+            ``"paper"``: ``(content + tag, label)``
+            anything else: ``(content, tag)``
+
+        Blocks that are empty once images and surrounding whitespace have been
+        removed are skipped.
+        """
+        sections: list[SectionTuple] = []
+
+        if algorithm != "PaddleOCR-VL":
+            return sections
+
+        # ``or`` fallbacks tolerate keys that are present but null.
+        for page_idx, layout_result in enumerate(result.get("layoutParsingResults") or []):
+            pruned_result = layout_result.get("prunedResult") or {}
+
+            for block in pruned_result.get("parsing_res_list") or []:
+                # Strip images first so that image-only blocks count as empty.
+                block_content = _remove_images_from_markdown(block.get("block_content") or "").strip()
+                if not block_content:
+                    continue
+
+                label = block.get("block_label", "")
+                block_bbox = block.get("block_bbox") or [0, 0, 0, 0]
+
+                # Position tag with a 1-based page number.
+                # NOTE(review): presumably block_bbox is [x0, y0, x1, y1], which
+                # makes the tag order x0, x1, y0, y1 -- confirm against the API.
+                tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
+
+                if parse_method == "manual":
+                    sections.append((block_content, label, tag))
+                elif parse_method == "paper":
+                    sections.append((block_content + tag, label))
+                else:
+                    sections.append((block_content, tag))
+
+        return sections
+
+    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
+        """Convert the API response to table tuples.
+
+        Table extraction is not implemented: *result* is ignored and an empty
+        list is always returned.
+        """
+        return []
+
+
+if __name__ == "__main__":
+    # Smoke check: report whether the parser is configured through the
+    # PADDLEOCR_API_URL / PADDLEOCR_ALGORITHM environment variables.
+    logging.basicConfig(level=logging.INFO)
+    env_api_url = os.getenv("PADDLEOCR_API_URL", "")
+    env_algorithm = os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
+    available, detail = PaddleOCRParser(api_url=env_api_url, algorithm=env_algorithm).check_installation()
+    print("PaddleOCR available:", available, detail)