Feat: add Docling parser (#10759)

### What problem does this PR solve? issue: #3945 change: add Docling parser ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-06 18:45:08 +08:00 · 2025-10-23 19:44:25 +08:00
parent de24e74b4c
commit 0ff2042fc1
5 changed files with 378 additions and 0 deletions
--- a/deepdoc/parser/docling_parser.py
+++ b/deepdoc/parser/docling_parser.py
@ -0,0 +1,344 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from __future__ import annotations
+
+import logging
+import re
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from os import PathLike
+from pathlib import Path
+from typing import Any, Callable, Iterable, Optional
+
+import pdfplumber
+from PIL import Image
+
+try:
+    from docling.document_converter import DocumentConverter
+except Exception:
+    DocumentConverter = None  
+
+try:
+    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
+except Exception:
+    class RAGFlowPdfParser:  
+        pass
+
+
+class DoclingContentType(str, Enum):
+    IMAGE = "image"
+    TABLE = "table"
+    TEXT = "text"
+    EQUATION = "equation"
+
+
+@dataclass
+class _BBox:
+    page_no: int  
+    x0: float
+    y0: float
+    x1: float
+    y1: float
+
+
+class DoclingParser(RAGFlowPdfParser):
+    def __init__(self):
+        self.logger = logging.getLogger(self.__class__.__name__)
+        self.page_images: list[Image.Image] = []
+        self.page_from = 0
+        self.page_to = 10_000
+
+    def check_installation(self) -> bool:
+        if DocumentConverter is None:
+            self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
+            return False
+        try:
+            _ = DocumentConverter()
+            return True
+        except Exception as e:
+            self.logger.error(f"[Docling] init DocumentConverter failed: {e}")
+            return False
+
+    def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
+        self.page_from = page_from
+        self.page_to = page_to
+        try:
+            opener = pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm))
+            with opener as pdf:
+                pages = pdf.pages[page_from:page_to]
+                self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for p in pages]
+        except Exception as e:
+            self.page_images = []
+            self.logger.exception(e)
+
+    def _make_line_tag(self,bbox: _BBox) -> str:
+        if bbox is None:
+            return ""
+        x0,x1, top, bott = bbox.x0, bbox.x1, bbox.y0, bbox.y1
+        if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= bbox.page_no:
+            _, page_height = self.page_images[bbox.page_no-1].size
+            top, bott = page_height-top ,page_height-bott
+        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(
+            bbox.page_no, x0,x1, top, bott
+        )
+
+    @staticmethod
+    def extract_positions(txt: str) -> list[tuple[list[int], float, float, float, float]]:
+        poss = []
+        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
+            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
+            left, right, top, bottom = float(left), float(right), float(top), float(bottom)
+            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
+        return poss
+
+    def crop(self, text: str, ZM: int = 1, need_position: bool = False):
+        imgs = []
+        poss = self.extract_positions(text)
+        if not poss:
+            return (None, None) if need_position else None
+
+        GAP = 6
+        pos = poss[0]
+        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        pos = poss[-1]
+        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
+        positions = []
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            if bottom <= top:
+                bottom = top + 4
+            img0 = self.page_images[pns[0]]
+            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
+            
+            crop0 = img0.crop((x0, y0, x1, y1))
+            imgs.append(crop0)
+            if 0 < ii < len(poss)-1:
+                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
+            remain_bottom = bottom - img0.size[1]
+            for pn in pns[1:]:
+                if remain_bottom <= 0:
+                    break
+                page = self.page_images[pn]
+                x0, y0, x1, y1 = int(left), 0, int(right), int(min(remain_bottom, page.size[1]))
+                cimgp = page.crop((x0, y0, x1, y1))
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn + self.page_from, x0, x1, y0, y1))
+                remain_bottom -= page.size[1]
+
+        if not imgs:
+            return (None, None) if need_position else None
+
+        height = sum(i.size[1] + GAP for i in imgs)
+        width = max(i.size[0] for i in imgs)
+        pic = Image.new("RGB", (width, int(height)), (245, 245, 245))
+        h = 0
+        for ii, img in enumerate(imgs):
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(h)))
+            h += img.size[1] + GAP
+
+        return (pic, positions) if need_position else pic
+
+    def _iter_doc_items(self, doc) -> Iterable[tuple[str, Any, Optional[_BBox]]]:
+        for t in getattr(doc, "texts", []):
+            parent=getattr(t, "parent", "")
+            ref=getattr(parent,"cref","")
+            label=getattr(t, "label", "")
+            if (label in ("section_header","text",) and ref in ("#/body",)) or label in ("list_item",):
+                text = getattr(t, "text", "") or ""
+                bbox = None
+                if getattr(t, "prov", None):
+                    pn = getattr(t.prov[0], "page_no", None)
+                    bb = getattr(t.prov[0], "bbox", None)
+                    bb = [getattr(bb, "l", None),getattr(bb, "t", None),getattr(bb, "r", None),getattr(bb, "b", None)]
+                    if pn and bb and len(bb) == 4:
+                        bbox = _BBox(page_no=int(pn), x0=bb[0], y0=bb[1], x1=bb[2], y1=bb[3])
+                yield (DoclingContentType.TEXT.value, text, bbox)
+
+        for item in getattr(doc, "texts", []):
+            if getattr(item, "label", "") in ("FORMULA",):
+                text = getattr(item, "text", "") or ""
+                bbox = None
+                if getattr(item, "prov", None):
+                    pn = getattr(item.prov, "page_no", None)
+                    bb = getattr(item.prov, "bbox", None)
+                    bb = [getattr(bb, "l", None),getattr(bb, "t", None),getattr(bb, "r", None),getattr(bb, "b", None)]
+                    if pn and bb and len(bb) == 4:
+                        bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
+                yield (DoclingContentType.EQUATION.value, text, bbox)
+
+    def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
+        """
+        和 MinerUParser 保持一致：返回 [(section_text, line_tag), ...]
+        """
+        sections: list[tuple[str, str]] = []
+        for typ, payload, bbox in self._iter_doc_items(doc):
+            if typ == DoclingContentType.TEXT.value:
+                section = payload.strip()
+                if not section:
+                    continue
+            elif typ == DoclingContentType.EQUATION.value:
+                section = payload.strip()
+            else:
+                continue
+            
+            tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else ""
+            sections.append((section, tag))
+        return sections
+
+    def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1):
+        if not getattr(self, "page_images", None):
+            return None, ""
+
+        idx = (page_no - 1) - getattr(self, "page_from", 0)
+        if idx < 0 or idx >= len(self.page_images):
+            return None, ""
+
+        page_img = self.page_images[idx]
+        W, H = page_img.size
+        left, top, right, bott = bbox
+
+        x0 = float(left)
+        y0 = float(H-top)
+        x1 = float(right)
+        y1 = float(H-bott)
+
+        x0, y0 = max(0.0, min(x0, W - 1)), max(0.0, min(y0, H - 1))
+        x1, y1 = max(x0 + 1.0, min(x1, W)), max(y0 + 1.0, min(y1, H))
+
+        try:
+            crop = page_img.crop((int(x0), int(y0), int(x1), int(y1))).convert("RGB")
+        except Exception:
+            return None, ""
+
+        pos = (page_no-1 if page_no>0 else 0, x0, x1, y0, y1)
+        return crop, [pos]
+
+    def _transfer_to_tables(self, doc):
+        tables = []
+        for tab in getattr(doc, "tables", []):
+            img = None
+            positions = ""
+            if getattr(tab, "prov", None):
+                pn = getattr(tab.prov[0], "page_no", None)
+                bb = getattr(tab.prov[0], "bbox", None)
+                if pn is not None and bb is not None:
+                    left = getattr(bb, "l", None)
+                    top = getattr(bb, "t", None)
+                    right = getattr(bb, "r", None)
+                    bott = getattr(bb, "b", None)
+                    if None not in (left, top, right, bott):
+                        img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
+            html = ""
+            try:
+                html = tab.export_to_html(doc=doc)
+            except Exception:
+                pass
+            tables.append(((img, html), positions if positions else ""))
+        for pic in getattr(doc, "pictures", []):
+            img = None
+            positions = ""
+            if getattr(pic, "prov", None):
+                pn = getattr(pic.prov[0], "page_no", None)
+                bb = getattr(pic.prov[0], "bbox", None)
+                if pn is not None and bb is not None:
+                    left = getattr(bb, "l", None)
+                    top = getattr(bb, "t", None)
+                    right = getattr(bb, "r", None)
+                    bott = getattr(bb, "b", None)
+                    if None not in (left, top, right, bott):
+                        img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott)))
+            captions = ""
+            try:
+                captions = pic.caption_text(doc=doc)
+            except Exception:
+                pass
+            tables.append(((img, [captions]), positions if positions else ""))
+        return tables
+
+    def parse_pdf(
+        self,
+        filepath: str | PathLike[str],
+        binary: BytesIO | bytes | None = None,
+        callback: Optional[Callable] = None,
+        *,
+        output_dir: Optional[str] = None, 
+        lang: Optional[str] = None,        
+        method: str = "auto",             
+        delete_output: bool = True,       
+    ):
+
+        if not self.check_installation():
+            raise RuntimeError("Docling not available, please install `docling`")
+
+        if binary is not None:
+            tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
+            tmpdir.mkdir(parents=True, exist_ok=True)
+            name = Path(filepath).name or "input.pdf"
+            tmp_pdf = tmpdir / name
+            with open(tmp_pdf, "wb") as f:
+                if isinstance(binary, (bytes, bytearray)):
+                    f.write(binary)
+                else:
+                    f.write(binary.getbuffer())
+            src_path = tmp_pdf
+        else:
+            src_path = Path(filepath)
+            if not src_path.exists():
+                raise FileNotFoundError(f"PDF not found: {src_path}")
+
+        if callback:
+            callback(0.1, f"[Docling] Converting: {src_path}")
+
+        try:
+            self.__images__(str(src_path), zoomin=1)
+        except Exception as e:
+            self.logger.warning(f"[Docling] render pages failed: {e}")
+
+        conv = DocumentConverter()  
+        conv_res = conv.convert(str(src_path))
+        doc = conv_res.document
+        if callback:
+            callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages")
+
+        sections = self._transfer_to_sections(doc)
+        tables = self._transfer_to_tables(doc)
+
+        if callback:
+            callback(0.95, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}")
+
+        if binary is not None and delete_output:
+            try:
+                Path(src_path).unlink(missing_ok=True)
+            except Exception:
+                pass
+
+        if callback:
+            callback(1.0, "[Docling] Done.")
+        return sections, tables
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+    parser = DoclingParser()
+    print("Docling available:", parser.check_installation())
+    sections, tables = parser.parse_pdf(filepath="test_docling/toc.pdf", binary=None)
+    print(len(sections), len(tables))
--- a/docker/.env
+++ b/docker/.env
@ -195,3 +195,4 @@ REGISTER_ENABLED=1
 # COMPOSE_PROFILES=infinity,sandbox
 #    - For OpenSearch:
 # COMPOSE_PROFILES=opensearch,sandbox
+USE_DOCLING=false
--- a/docker/entrypoint.sh
+++ b/docker/entrypoint.sh
@ -178,6 +178,16 @@ function start_mcp_server() {
        "${MCP_JSON_RESPONSE_FLAG}" &
 }

+function ensure_docling() {
+  if [[ "${USE_DOCLING}" == "true" ]]; then
+    if ! python3 -c "import importlib.util,sys; sys.exit(0 if importlib.util.find_spec('docling') else 1)"; then
+      echo "[docling] not found, installing..."
+      python3 -m pip install --no-cache-dir "docling${DOCLING_VERSION:-}"
+    else
+      echo "[docling] already installed, skip."
+    fi
+  fi
+}
 # -----------------------------------------------------------------------------
 # Start components based on flags
 # -----------------------------------------------------------------------------
@ -203,6 +213,8 @@ if [[ "${ENABLE_MCP_SERVER}" -eq 1 ]]; then
    start_mcp_server
 fi

+ensure_docling
+
 if [[ "${ENABLE_TASKEXECUTOR}" -eq 1 ]]; then
    if [[ "${CONSUMER_NO_END}" -gt "${CONSUMER_NO_BEG}" ]]; then
        echo "Starting task executors on host '${HOST_ID}' for IDs in [${CONSUMER_NO_BEG}, ${CONSUMER_NO_END})..."
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -35,6 +35,7 @@ from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, Mark
 from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from deepdoc.parser.mineru_parser import MinerUParser
+from deepdoc.parser.docling_parser import DoclingParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


@ -532,6 +533,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
            )
            parser_config["chunk_token_num"] = 0
            callback(0.8, "Finish parsing.")
+
+        elif layout_recognizer == "Docling":
+            pdf_parser = DoclingParser()
+            if not pdf_parser.check_installation():
+                callback(-1, "Docling not found.")
+                return res
+
+            sections, tables = pdf_parser.parse_pdf(
+                filepath=filename,
+                binary=binary,
+                callback=callback,
+                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
+                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
+            )
+            parser_config["chunk_token_num"] = 0
+            res = tokenize_table(tables, doc, is_english)
+            callback(0.8, "Finish parsing.")
+        
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()
--- a/web/src/components/layout-recognize-form-field.tsx
+++ b/web/src/components/layout-recognize-form-field.tsx
@ -18,6 +18,7 @@ export const enum ParseDocumentType {
  DeepDOC = 'DeepDOC',
  PlainText = 'Plain Text',
  MinerU = 'MinerU',
+  Docling = 'Docling',
 }

 export function LayoutRecognizeFormField({
@ -43,6 +44,7 @@ export function LayoutRecognizeFormField({
          ParseDocumentType.DeepDOC,
          ParseDocumentType.PlainText,
          ParseDocumentType.MinerU,
+          ParseDocumentType.Docling,
        ].map((x) => ({
          label: x === ParseDocumentType.PlainText ? t(camelCase(x)) : x,
          value: x,