From 0ff2042fc1d33ffa581d27798d8b8ee7948445de Mon Sep 17 00:00:00 2001 From: buua436 <66937541+buua436@users.noreply.github.com> Date: Thu, 23 Oct 2025 19:44:25 +0800 Subject: [PATCH] Feat: add Docling parser (#10759) ### What problem does this PR solve? issue: #3945 change: add Docling parser ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/docling_parser.py | 344 ++++++++++++++++++ docker/.env | 1 + docker/entrypoint.sh | 12 + rag/app/naive.py | 19 + .../layout-recognize-form-field.tsx | 2 + 5 files changed, 378 insertions(+) create mode 100644 deepdoc/parser/docling_parser.py diff --git a/deepdoc/parser/docling_parser.py b/deepdoc/parser/docling_parser.py new file mode 100644 index 000000000..dd0f57ea4 --- /dev/null +++ b/deepdoc/parser/docling_parser.py @@ -0,0 +1,344 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import annotations + +import logging +import re +from dataclasses import dataclass +from enum import Enum +from io import BytesIO +from os import PathLike +from pathlib import Path +from typing import Any, Callable, Iterable, Optional + +import pdfplumber +from PIL import Image + +try: + from docling.document_converter import DocumentConverter +except Exception: + DocumentConverter = None + +try: + from deepdoc.parser.pdf_parser import RAGFlowPdfParser +except Exception: + class RAGFlowPdfParser: + pass + + +class DoclingContentType(str, Enum): + IMAGE = "image" + TABLE = "table" + TEXT = "text" + EQUATION = "equation" + + +@dataclass +class _BBox: + page_no: int + x0: float + y0: float + x1: float + y1: float + + +class DoclingParser(RAGFlowPdfParser): + def __init__(self): + self.logger = logging.getLogger(self.__class__.__name__) + self.page_images: list[Image.Image] = [] + self.page_from = 0 + self.page_to = 10_000 + + def check_installation(self) -> bool: + if DocumentConverter is None: + self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling") + return False + try: + _ = DocumentConverter() + return True + except Exception as e: + self.logger.error(f"[Docling] init DocumentConverter failed: {e}") + return False + + def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): + self.page_from = page_from + self.page_to = page_to + try: + opener = pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) + with opener as pdf: + pages = pdf.pages[page_from:page_to] + self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for p in pages] + except Exception as e: + self.page_images = [] + self.logger.exception(e) + + def _make_line_tag(self,bbox: _BBox) -> str: + if bbox is None: + return "" + x0,x1, top, bott = bbox.x0, bbox.x1, bbox.y0, bbox.y1 + if hasattr(self, "page_images") and self.page_images and len(self.page_images) >= 
bbox.page_no: + _, page_height = self.page_images[bbox.page_no-1].size + top, bott = page_height-top ,page_height-bott + return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format( + bbox.page_no, x0,x1, top, bott + ) + + @staticmethod + def extract_positions(txt: str) -> list[tuple[list[int], float, float, float, float]]: + poss = [] + for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt): + pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t") + left, right, top, bottom = float(left), float(right), float(top), float(bottom) + poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom)) + return poss + + def crop(self, text: str, ZM: int = 1, need_position: bool = False): + imgs = [] + poss = self.extract_positions(text) + if not poss: + return (None, None) if need_position else None + + GAP = 6 + pos = poss[0] + poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0))) + pos = poss[-1] + poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120))) + positions = [] + for ii, (pns, left, right, top, bottom) in enumerate(poss): + if bottom <= top: + bottom = top + 4 + img0 = self.page_images[pns[0]] + x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1])) + + crop0 = img0.crop((x0, y0, x1, y1)) + imgs.append(crop0) + if 0 < ii < len(poss)-1: + positions.append((pns[0] + self.page_from, x0, x1, y0, y1)) + remain_bottom = bottom - img0.size[1] + for pn in pns[1:]: + if remain_bottom <= 0: + break + page = self.page_images[pn] + x0, y0, x1, y1 = int(left), 0, int(right), int(min(remain_bottom, page.size[1])) + cimgp = page.crop((x0, y0, x1, y1)) + imgs.append(cimgp) + if 0 < ii < len(poss) - 1: + positions.append((pn + self.page_from, x0, x1, y0, y1)) + remain_bottom -= page.size[1] + + if not imgs: + return (None, None) if need_position else None + + height = sum(i.size[1] + GAP for i in 
imgs)
+        width = max(i.size[0] for i in imgs)
+        pic = Image.new("RGB", (width, int(height)), (245, 245, 245))
+        h = 0
+        for ii, img in enumerate(imgs):
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(h)))
+            h += img.size[1] + GAP
+
+        return (pic, positions) if need_position else pic
+
+    def _iter_doc_items(self, doc) -> Iterable[tuple[str, Any, Optional[_BBox]]]:
+        for t in getattr(doc, "texts", []):
+            parent=getattr(t, "parent", "")
+            ref=getattr(parent,"cref","")
+            label=getattr(t, "label", "")
+            if (label in ("section_header","text",) and ref in ("#/body",)) or label in ("list_item",):
+                text = getattr(t, "text", "") or ""
+                bbox = None
+                if getattr(t, "prov", None):
+                    pn = getattr(t.prov[0], "page_no", None)
+                    bb = getattr(t.prov[0], "bbox", None)
+                    bb = [getattr(bb, "l", None),getattr(bb, "t", None),getattr(bb, "r", None),getattr(bb, "b", None)]
+                    if pn and bb and len(bb) == 4:
+                        bbox = _BBox(page_no=int(pn), x0=bb[0], y0=bb[1], x1=bb[2], y1=bb[3])
+                yield (DoclingContentType.TEXT.value, text, bbox)
+
+        for item in getattr(doc, "texts", []):
+            if getattr(item, "label", "") in ("formula",):
+                text = getattr(item, "text", "") or ""
+                bbox = None
+                if getattr(item, "prov", None):
+                    pn = getattr(item.prov[0], "page_no", None)
+                    bb = getattr(item.prov[0], "bbox", None)
+                    bb = [getattr(bb, "l", None),getattr(bb, "t", None),getattr(bb, "r", None),getattr(bb, "b", None)]
+                    if pn and bb and len(bb) == 4:
+                        bbox = _BBox(int(pn), bb[0], bb[1], bb[2], bb[3])
+                yield (DoclingContentType.EQUATION.value, text, bbox)
+
+    def _transfer_to_sections(self, doc) -> list[tuple[str, str]]:
+        """
+        Consistent with MinerUParser: returns [(section_text, line_tag), ...]
+ """ + sections: list[tuple[str, str]] = [] + for typ, payload, bbox in self._iter_doc_items(doc): + if typ == DoclingContentType.TEXT.value: + section = payload.strip() + if not section: + continue + elif typ == DoclingContentType.EQUATION.value: + section = payload.strip() + else: + continue + + tag = self._make_line_tag(bbox) if isinstance(bbox,_BBox) else "" + sections.append((section, tag)) + return sections + + def cropout_docling_table(self, page_no: int, bbox: tuple[float, float, float, float], zoomin: int = 1): + if not getattr(self, "page_images", None): + return None, "" + + idx = (page_no - 1) - getattr(self, "page_from", 0) + if idx < 0 or idx >= len(self.page_images): + return None, "" + + page_img = self.page_images[idx] + W, H = page_img.size + left, top, right, bott = bbox + + x0 = float(left) + y0 = float(H-top) + x1 = float(right) + y1 = float(H-bott) + + x0, y0 = max(0.0, min(x0, W - 1)), max(0.0, min(y0, H - 1)) + x1, y1 = max(x0 + 1.0, min(x1, W)), max(y0 + 1.0, min(y1, H)) + + try: + crop = page_img.crop((int(x0), int(y0), int(x1), int(y1))).convert("RGB") + except Exception: + return None, "" + + pos = (page_no-1 if page_no>0 else 0, x0, x1, y0, y1) + return crop, [pos] + + def _transfer_to_tables(self, doc): + tables = [] + for tab in getattr(doc, "tables", []): + img = None + positions = "" + if getattr(tab, "prov", None): + pn = getattr(tab.prov[0], "page_no", None) + bb = getattr(tab.prov[0], "bbox", None) + if pn is not None and bb is not None: + left = getattr(bb, "l", None) + top = getattr(bb, "t", None) + right = getattr(bb, "r", None) + bott = getattr(bb, "b", None) + if None not in (left, top, right, bott): + img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott))) + html = "" + try: + html = tab.export_to_html(doc=doc) + except Exception: + pass + tables.append(((img, html), positions if positions else "")) + for pic in getattr(doc, "pictures", []): + img = None + positions = "" + 
if getattr(pic, "prov", None): + pn = getattr(pic.prov[0], "page_no", None) + bb = getattr(pic.prov[0], "bbox", None) + if pn is not None and bb is not None: + left = getattr(bb, "l", None) + top = getattr(bb, "t", None) + right = getattr(bb, "r", None) + bott = getattr(bb, "b", None) + if None not in (left, top, right, bott): + img, positions = self.cropout_docling_table(int(pn), (float(left), float(top), float(right), float(bott))) + captions = "" + try: + captions = pic.caption_text(doc=doc) + except Exception: + pass + tables.append(((img, [captions]), positions if positions else "")) + return tables + + def parse_pdf( + self, + filepath: str | PathLike[str], + binary: BytesIO | bytes | None = None, + callback: Optional[Callable] = None, + *, + output_dir: Optional[str] = None, + lang: Optional[str] = None, + method: str = "auto", + delete_output: bool = True, + ): + + if not self.check_installation(): + raise RuntimeError("Docling not available, please install `docling`") + + if binary is not None: + tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp" + tmpdir.mkdir(parents=True, exist_ok=True) + name = Path(filepath).name or "input.pdf" + tmp_pdf = tmpdir / name + with open(tmp_pdf, "wb") as f: + if isinstance(binary, (bytes, bytearray)): + f.write(binary) + else: + f.write(binary.getbuffer()) + src_path = tmp_pdf + else: + src_path = Path(filepath) + if not src_path.exists(): + raise FileNotFoundError(f"PDF not found: {src_path}") + + if callback: + callback(0.1, f"[Docling] Converting: {src_path}") + + try: + self.__images__(str(src_path), zoomin=1) + except Exception as e: + self.logger.warning(f"[Docling] render pages failed: {e}") + + conv = DocumentConverter() + conv_res = conv.convert(str(src_path)) + doc = conv_res.document + if callback: + callback(0.7, f"[Docling] Parsed doc: {getattr(doc, 'num_pages', 'n/a')} pages") + + sections = self._transfer_to_sections(doc) + tables = self._transfer_to_tables(doc) + + if callback: + 
callback(0.95, f"[Docling] Sections: {len(sections)}, Tables: {len(tables)}") + + if binary is not None and delete_output: + try: + Path(src_path).unlink(missing_ok=True) + except Exception: + pass + + if callback: + callback(1.0, "[Docling] Done.") + return sections, tables + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + parser = DoclingParser() + print("Docling available:", parser.check_installation()) + sections, tables = parser.parse_pdf(filepath="test_docling/toc.pdf", binary=None) + print(len(sections), len(tables)) diff --git a/docker/.env b/docker/.env index b8384f445..c76f4f261 100644 --- a/docker/.env +++ b/docker/.env @@ -195,3 +195,4 @@ REGISTER_ENABLED=1 # COMPOSE_PROFILES=infinity,sandbox # - For OpenSearch: # COMPOSE_PROFILES=opensearch,sandbox +USE_DOCLING=false diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 7f660d06d..2d9584a30 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -178,6 +178,16 @@ function start_mcp_server() { "${MCP_JSON_RESPONSE_FLAG}" & } +function ensure_docling() { + if [[ "${USE_DOCLING}" == "true" ]]; then + if ! python3 -c "import importlib.util,sys; sys.exit(0 if importlib.util.find_spec('docling') else 1)"; then + echo "[docling] not found, installing..." + python3 -m pip install --no-cache-dir "docling${DOCLING_VERSION:-}" + else + echo "[docling] already installed, skip." + fi + fi +} # ----------------------------------------------------------------------------- # Start components based on flags # ----------------------------------------------------------------------------- @@ -203,6 +213,8 @@ if [[ "${ENABLE_MCP_SERVER}" -eq 1 ]]; then start_mcp_server fi +ensure_docling + if [[ "${ENABLE_TASKEXECUTOR}" -eq 1 ]]; then if [[ "${CONSUMER_NO_END}" -gt "${CONSUMER_NO_BEG}" ]]; then echo "Starting task executors on host '${HOST_ID}' for IDs in [${CONSUMER_NO_BEG}, ${CONSUMER_NO_END})..." 
diff --git a/rag/app/naive.py b/rag/app/naive.py index a6c67b6b6..7497a04e1 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -35,6 +35,7 @@ from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, Mark from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.mineru_parser import MinerUParser +from deepdoc.parser.docling_parser import DoclingParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table @@ -532,6 +533,24 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, ) parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") + + elif layout_recognizer == "Docling": + pdf_parser = DoclingParser() + if not pdf_parser.check_installation(): + callback(-1, "Docling not found.") + return res + + sections, tables = pdf_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), + delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), + ) + parser_config["chunk_token_num"] = 0 + res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") + else: if layout_recognizer == "Plain Text": pdf_parser = PlainParser() diff --git a/web/src/components/layout-recognize-form-field.tsx b/web/src/components/layout-recognize-form-field.tsx index 2a51a1bff..fb4935f33 100644 --- a/web/src/components/layout-recognize-form-field.tsx +++ b/web/src/components/layout-recognize-form-field.tsx @@ -18,6 +18,7 @@ export const enum ParseDocumentType { DeepDOC = 'DeepDOC', PlainText = 'Plain Text', MinerU = 'MinerU', + Docling = 'Docling', } export function LayoutRecognizeFormField({ @@ -43,6 +44,7 @@ export function LayoutRecognizeFormField({ ParseDocumentType.DeepDOC, 
ParseDocumentType.PlainText, ParseDocumentType.MinerU, + ParseDocumentType.Docling, ].map((x) => ({ label: x === ParseDocumentType.PlainText ? t(camelCase(x)) : x, value: x,