From 27a36344d4ca03ab117cd8c6022ebd80ee74b3c0 Mon Sep 17 00:00:00 2001 From: Lin Manhui Date: Tue, 27 Jan 2026 09:49:46 +0800 Subject: [PATCH] Feat: Support PaddleOCR-VL-1.5 interface (#12819) ### What problem does this PR solve? This PR adds support for the PaddleOCR-VL-1.5 interface to the PaddleOCR PDF Parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/paddleocr_parser.py | 16 +++++++---- docs/faq.mdx | 28 +++++++++---------- .../paddleocr-options-form-field.tsx | 2 +- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index f6611e0c4..85db63b86 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py @@ -63,10 +63,10 @@ def _remove_images_from_markdown(markdown: str) -> str: class PaddleOCRVLConfig: """Configuration for PaddleOCR-VL algorithm.""" + use_doc_orientation_classify: Optional[bool] = False use_doc_orientation_classify: Optional[bool] = False use_doc_unwarping: Optional[bool] = False use_layout_detection: Optional[bool] = None - use_polygon_points: Optional[bool] = None use_chart_recognition: Optional[bool] = None use_seal_recognition: Optional[bool] = None use_ocr_for_image_block: Optional[bool] = None @@ -74,6 +74,7 @@ class PaddleOCRVLConfig: layout_nms: Optional[bool] = None layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None layout_merge_bboxes_mode: Optional[Union[str, dict]] = None + layout_shape_mode: Optional[str] = None prompt_label: Optional[str] = None format_block_content: Optional[bool] = True repetition_penalty: Optional[float] = None @@ -85,6 +86,9 @@ class PaddleOCRVLConfig: merge_layout_blocks: Optional[bool] = False markdown_ignore_labels: Optional[List[str]] = None vlm_extra_args: Optional[dict] = None + restructure_pages: Optional[bool] = False + merge_tables: Optional[bool] = None + relevel_titles: Optional[bool] = None @dataclass @@ 
-111,13 +115,12 @@ class PaddleOCRConfig: algorithm = cfg.get("algorithm", "PaddleOCR-VL") # Validate algorithm - if algorithm not in ("PaddleOCR-VL",): + if algorithm not in ("PaddleOCR-VL"): raise ValueError(f"Unsupported algorithm: {algorithm}") # Extract algorithm-specific configuration algorithm_config: dict[str, Any] = {} if algorithm == "PaddleOCR-VL": - # Create default PaddleOCRVLConfig object and convert to dict algorithm_config = asdict(PaddleOCRVLConfig()) algorithm_config_user = cfg.get("algorithm_config") if isinstance(algorithm_config_user, dict): @@ -160,7 +163,6 @@ class PaddleOCRParser(RAGFlowPdfParser): "use_doc_orientation_classify": "useDocOrientationClassify", "use_doc_unwarping": "useDocUnwarping", "use_layout_detection": "useLayoutDetection", - "use_polygon_points": "usePolygonPoints", "use_chart_recognition": "useChartRecognition", "use_seal_recognition": "useSealRecognition", "use_ocr_for_image_block": "useOcrForImageBlock", @@ -168,6 +170,7 @@ class PaddleOCRParser(RAGFlowPdfParser): "layout_nms": "layoutNms", "layout_unclip_ratio": "layoutUnclipRatio", "layout_merge_bboxes_mode": "layoutMergeBboxesMode", + "layout_shape_mode": "layoutShapeMode", "prompt_label": "promptLabel", "format_block_content": "formatBlockContent", "repetition_penalty": "repetitionPenalty", @@ -179,6 +182,9 @@ class PaddleOCRParser(RAGFlowPdfParser): "merge_layout_blocks": "mergeLayoutBlocks", "markdown_ignore_labels": "markdownIgnoreLabels", "vlm_extra_args": "vlmExtraArgs", + "restructure_pages": "restructurePages", + "merge_tables": "mergeTables", + "relevel_titles": "relevelTitles", }, } @@ -370,7 +376,7 @@ class PaddleOCRParser(RAGFlowPdfParser): """Convert API response to section tuples.""" sections: list[SectionTuple] = [] - if algorithm == "PaddleOCR-VL": + if algorithm in ("PaddleOCR-VL",): layout_parsing_results = result.get("layoutParsingResults", []) for page_idx, layout_result in enumerate(layout_parsing_results): diff --git a/docs/faq.mdx 
b/docs/faq.mdx index d08bb9361..cc7ab374b 100644 --- a/docs/faq.mdx +++ b/docs/faq.mdx @@ -43,11 +43,11 @@ You can find the RAGFlow version number on the **System** page of the UI: If you build RAGFlow from source, the version number is also in the system log: ``` - ____ ___ ______ ______ __ + ____ ___ ______ ______ __ / __ \ / | / ____// ____// /____ _ __ / /_/ // /| | / / __ / /_ / // __ \| | /| / / - / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ / - /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/ + / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ / + /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/ 2025-02-18 10:10:43,835 INFO 1445658 RAGFlow version: v0.15.0-50-g6daae7f2 ``` @@ -177,7 +177,7 @@ To fix this issue, use https://hf-mirror.com instead: 3. Start up the server: ```bash - docker compose up -d + docker compose up -d ``` --- @@ -210,11 +210,11 @@ You will not log in to RAGFlow unless the server is fully initialized. Run `dock *The server is successfully initialized, if your system displays the following:* ``` - ____ ___ ______ ______ __ + ____ ___ ______ ______ __ / __ \ / | / ____// ____// /____ _ __ / /_/ // /| | / / __ / /_ / // __ \| | /| / / - / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ / - /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/ + / _, _// ___ |/ /_/ // __/ / // /_/ /| |/ |/ / + /_/ |_|/_/ |_|\____//_/ /_/ \____/ |__/|__/ * Running on all addresses (0.0.0.0) * Running on http://127.0.0.1:9380 @@ -317,7 +317,7 @@ The status of a Docker container status does not necessarily reflect the status $ docker ps ``` - *The status of a healthy Elasticsearch component should look as follows:* + *The status of a healthy Elasticsearch component should look as follows:* ``` 91220e3285dd docker.elastic.co/elasticsearch/elasticsearch:8.11.3 "/bin/tini -- /usr/l…" 11 hours ago Up 11 hours (healthy) 9300/tcp, 0.0.0.0:9200->9200/tcp, :::9200->9200/tcp ragflow-es-01 @@ -370,7 +370,7 @@ Yes, we do. See the Python files under the **rag/app** folder. 
$ docker ps ``` - *The status of a healthy Elasticsearch component should look as follows:* + *The status of a healthy MinIO component should look as follows:* ```bash cd29bcb254bc quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z "/usr/bin/docker-ent…" 2 weeks ago Up 11 hours 0.0.0.0:9001->9001/tcp, :::9001->9001/tcp, 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp ragflow-minio @@ -453,7 +453,7 @@ See [Upgrade RAGFlow](./guides/upgrade_ragflow.mdx) for more information. To switch your document engine from Elasticsearch to [Infinity](https://github.com/infiniflow/infinity): -1. Stop all running containers: +1. Stop all running containers: ```bash $ docker compose -f docker/docker-compose.yml down -v @@ -463,7 +463,7 @@ To switch your document engine from Elasticsearch to [Infinity](https://github.c ::: 2. In **docker/.env**, set `DOC_ENGINE=${DOC_ENGINE:-infinity}` -3. Restart your Docker image: +3. Restart your Docker image: ```bash $ docker compose -f docker-compose.yml up -d @@ -508,12 +508,12 @@ From v0.22.0 onwards, RAGFlow includes MinerU (≥ 2.6.3) as an optional PDF pa - `"vlm-mlx-engine"` - `"vlm-vllm-async-engine"` - `"vlm-lmdeploy-engine"`. - - `MINERU_SERVER_URL`: (optional) The downstream vLLM HTTP server (e.g., `http://vllm-host:30000`). Applicable when `MINERU_BACKEND` is set to `"vlm-http-client"`. + - `MINERU_SERVER_URL`: (optional) The downstream vLLM HTTP server (e.g., `http://vllm-host:30000`). Applicable when `MINERU_BACKEND` is set to `"vlm-http-client"`. - `MINERU_OUTPUT_DIR`: (optional) The local directory for holding the outputs of the MinerU API service (zip/JSON) before ingestion. - `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temporary directory is used: - `1`: Delete. - `0`: Retain. -3. In the web UI, navigate to your dataset's **Configuration** page and find the **Ingestion pipeline** section: +3. 
In the web UI, navigate to your dataset's **Configuration** page and find the **Ingestion pipeline** section: - If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown. - If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component. @@ -600,7 +600,7 @@ This method uses PaddleOCR's official API service with an access token. - If using custom ingestion pipeline, select **PaddleOCR** in the **Parser** component **Notes:** -- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr/task), click the **API** button in the upper-left corner, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`. +- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr), click the **API** button, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`. - Access tokens can be obtained from the [AI Studio platform](https://aistudio.baidu.com/account/accessToken). - This method requires internet connectivity to reach the official PaddleOCR API. diff --git a/web/src/components/paddleocr-options-form-field.tsx b/web/src/components/paddleocr-options-form-field.tsx index 0d70519eb..03adf5ee4 100644 --- a/web/src/components/paddleocr-options-form-field.tsx +++ b/web/src/components/paddleocr-options-form-field.tsx @@ -83,7 +83,7 @@ export function PaddleOCROptionsFormField({ > {(field) => (