feat: PaddleOCR PDF parser supports thumbnails and positions (#12565)

### What problem does this PR solve? 1. PaddleOCR PDF parser supports thumbnails and positions. 2. Add FAQ documentation for PaddleOCR PDF parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-30 15:16:45 +08:00 · 2026-01-13 09:51:08 +08:00
parent 44bada64c9
commit 4fe3c24198
4 changed files with 259 additions and 60 deletions
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@ -24,7 +24,10 @@ from os import PathLike
 from pathlib import Path
 from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List

+import numpy as np
+import pdfplumber
 import requests
+from PIL import Image

 try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str:
 class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm."""

-    use_doc_orientation_classify: Optional[bool] = None
-    use_doc_unwarping: Optional[bool] = None
+    use_doc_orientation_classify: Optional[bool] = False
+    use_doc_unwarping: Optional[bool] = False
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
@ -79,7 +82,7 @@ class PaddleOCRVLConfig:
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
-    merge_layout_blocks: Optional[bool] = None
+    merge_layout_blocks: Optional[bool] = False
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None

@ -116,14 +119,12 @@ class PaddleOCRConfig:
        if algorithm == "PaddleOCR-VL":
            # Create default PaddleOCRVLConfig object and convert to dict
            algorithm_config = asdict(PaddleOCRVLConfig())
-
-            # Apply user-provided VL config
-            vl_config = cfg.get("vl")
-            if isinstance(vl_config, dict):
-                algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})
+        algorithm_config_user = cfg.get("algorithm_config")
+        if isinstance(algorithm_config_user, dict):
+            algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None})

        # Remove processed keys
-        cfg.pop("vl", None)
+        cfg.pop("algorithm_config", None)

        # Prepare initialization arguments
        field_names = {field.name for field in fields(cls)}
@ -146,6 +147,8 @@ class PaddleOCRConfig:
 class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

+    _ZOOMIN = 2
+
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser."""
+        super().__init__()
+
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser):
        # Force PDF file type
        self.file_type = 0

+        # Initialize page images for cropping
+        self.page_images: list[Image.Image] = []
+        self.page_from = 0
+
    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured."""
@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
-        vl_config: Optional[dict[str, Any]] = None,
+        algorithm_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API."""
@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser):
            config_dict["visualize"] = visualize
        if additional_params is not None:
            config_dict["additional_params"] = additional_params
-        if vl_config is not None:
-            config_dict["vl"] = vl_config
-
-        # Add any VL config parameters from kwargs
-        for key, value in kwargs.items():
-            if key in {field.name for field in fields(PaddleOCRVLConfig)}:
-                config_dict[key] = value
+        if algorithm_config is not None:
+            config_dict["algorithm_config"] = algorithm_config

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

-        # Prepare file data
+        # Prepare file data and generate page images for cropping
        data_bytes = self._prepare_file_data(filepath, binary)

+        # Generate page images for cropping functionality
+        input_source = filepath if binary is None else binary
+        try:
+            self.__images__(input_source, callback=callback)
+        except Exception as e:
+            self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")
+
        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

@ -377,7 +388,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
                    label = block.get("block_label", "")
                    block_bbox = block.get("block_bbox", [0, 0, 0, 0])

-                    tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
+                    tag = f"@@{page_idx + 1}\t{block_bbox[0] // self._ZOOMIN}\t{block_bbox[2] // self._ZOOMIN}\t{block_bbox[1] // self._ZOOMIN}\t{block_bbox[3] // self._ZOOMIN}##"

                    if parse_method == "manual":
                        sections.append((block_content, label, tag))
@ -392,6 +403,149 @@ class PaddleOCRParser(RAGFlowPdfParser):
        """Convert API response to table tuples."""
        return []

+    def __images__(self, fnm, page_from=0, page_to=100, callback=None):
+        """Generate page images from PDF for cropping."""
+        self.page_from = page_from
+        self.page_to = page_to
+        try:
+            with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
+                self.pdf = pdf
+                self.page_images = [p.to_image(resolution=72, antialias=True).original for i, p in enumerate(self.pdf.pages[page_from:page_to])]
+        except Exception as e:
+            self.page_images = None
+            self.logger.exception(e)
+
+    @staticmethod
+    def extract_positions(txt: str):
+        """Extract position information from text tags."""
+        poss = []
+        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
+            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
+            left, right, top, bottom = float(left), float(right), float(top), float(bottom)
+            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
+        return poss
+
+    def crop(self, text: str, need_position: bool = False):
+        """Crop images from PDF based on position tags in text."""
+        imgs = []
+        poss = self.extract_positions(text)
+
+        if not poss:
+            if need_position:
+                return None, None
+            return
+
+        if not getattr(self, "page_images", None):
+            self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
+        GAP = 6
+        pos = poss[0]
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        pos = poss[-1]
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1]
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
+
+        positions = []
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            right = left + max_width
+
+            if bottom <= top:
+                bottom = top + 2
+
+            for pn in pns[1:]:
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
+
+            img0 = self.page_images[pns[0]]
+            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
+            crop0 = img0.crop((x0, y0, x1, y1))
+            imgs.append(crop0)
+            if 0 < ii < len(poss) - 1:
+                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
+
+            bottom -= img0.size[1]
+            for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
+                page = self.page_images[pn]
+                x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
+                cimgp = page.crop((x0, y0, x1, y1))
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn + self.page_from, x0, x1, y0, y1))
+                bottom -= page.size[1]
+
+        if not imgs:
+            if need_position:
+                return None, None
+            return
+
+        height = 0
+        for img in imgs:
+            height += img.size[1] + GAP
+        height = int(height)
+        width = int(np.max([i.size[0] for i in imgs]))
+        pic = Image.new("RGB", (width, height), (245, 245, 245))
+        height = 0
+        for ii, img in enumerate(imgs):
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(height)))
+            height += img.size[1] + GAP
+
+        if need_position:
+            return pic, positions
+        return pic
+

 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)