Feat: add VLM-boosted PDF parser (#6278)

### What problem does this PR solve?

Add a VLM-boosted PDF parser, which is used when a VLM (vision language model) is set.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-01-30 07:06:39 +08:00 · 2025-03-20 09:39:32 +08:00
parent 344727f9ba
commit 1d6760dd84
5 changed files with 181 additions and 33 deletions
--- a/deepdoc/parser/figure_parser.py
+++ b/deepdoc/parser/figure_parser.py
@ -0,0 +1,82 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
+from rag.prompts import vision_llm_figure_describe_prompt
+
+
+class VisionFigureParser:
+    """Describe figures with a vision model and merge the text into their captions.
+
+    ``figures_data`` is an iterable whose items take one of two shapes:
+
+    * ``(figure, descriptions)`` -- no position information;
+    * ``((figure, descriptions), positions)`` -- ``positions`` is a non-empty
+      list of 5-element tuples/lists.
+
+    All items are expected to share one shape; the constructor asserts that
+    the parsed lists line up.
+    """
+
+    def __init__(self, vision_model, figures_data, *args, **kwargs):
+        """Store the model and split ``figures_data`` into parallel lists.
+
+        Extra positional/keyword arguments are accepted and ignored.
+
+        Raises:
+            AssertionError: if an item has an unexpected shape, or if only
+                some of the items carry positions.
+        """
+        self.vision_model = vision_model
+        self._extract_figures_info(figures_data)
+        assert len(self.figures) == len(self.descriptions)
+        assert not self.positions or (len(self.figures) == len(self.positions))
+
+    @staticmethod
+    def _has_positions(item):
+        """Return True when ``item`` looks like ``((figure, descriptions), positions)``."""
+        if len(item) != 2:
+            return False
+        figure_desc, positions = item[0], item[1]
+        # Checking the first element as well keeps a plain
+        # ``(figure, ["Fig 1"])`` item (5-character caption) from being
+        # mistaken for a positioned one.
+        if not (isinstance(figure_desc, (tuple, list)) and len(figure_desc) == 2):
+            return False
+        if not (isinstance(positions, list) and positions):
+            return False
+        # Any number of position entries is accepted, not just one.
+        return all(isinstance(pos, (tuple, list)) and len(pos) == 5 for pos in positions)
+
+    @staticmethod
+    def _merge_description(vision_text, captions):
+        """Return ``vision_text`` followed by the existing caption text.
+
+        ``captions`` may be a list of strings (the initial state) or an
+        already-merged string (after an earlier ``__call__``). The two parts
+        are separated by a newline; an empty caption adds nothing.
+        """
+        caption_text = captions if isinstance(captions, str) else "\n".join(captions)
+        if not caption_text:
+            return vision_text
+        return f"{vision_text}\n{caption_text}"
+
+    def _extract_figures_info(self, figures_data):
+        """Fill ``self.figures``, ``self.descriptions`` and ``self.positions``.
+
+        ``self.positions`` stays empty when the items carry no positions.
+        """
+        self.figures = []
+        self.descriptions = []
+        self.positions = []
+
+        for item in figures_data:
+            if self._has_positions(item):
+                figure, description = item[0]
+                self.figures.append(figure)
+                self.descriptions.append(description)
+                self.positions.append(item[1])
+            else:
+                assert len(item) == 2 and isinstance(item, tuple), f"get {len(item)=}, {item=}"
+                self.figures.append(item[0])
+                self.descriptions.append(item[1])
+
+    def _assemble(self):
+        """Rebuild the result list from the parallel lists and return it.
+
+        Each entry is ``((figure, description), positions)`` when positions
+        are available, otherwise the 1-tuple ``((figure, description),)``.
+        """
+        self.assembled = []
+        self.has_positions = len(self.positions) != 0
+        for idx, figure_desc in enumerate(zip(self.figures, self.descriptions)):
+            if self.has_positions:
+                self.assembled.append((figure_desc, self.positions[idx]))
+            else:
+                self.assembled.append((figure_desc,))
+
+        return self.assembled
+
+    def __call__(self, **kwargs):
+        """Describe every figure with the vision model and return the assembled results.
+
+        Keyword Args:
+            callback: optional ``callback(prog, msg)`` forwarded to
+                ``picture_vision_llm_chunk``; defaults to a no-op.
+
+        Returns:
+            The list built by ``_assemble``. A figure for which the model
+            returns nothing keeps its original description.
+        """
+        callback = kwargs.get("callback", lambda prog, msg: None)
+
+        for figure_num, img_binary in enumerate(self.figures or []):
+            txt = picture_vision_llm_chunk(
+                binary=img_binary,
+                vision_model=self.vision_model,
+                prompt=vision_llm_figure_describe_prompt(),
+                callback=callback,
+            )
+
+            if txt:
+                self.descriptions[figure_num] = self._merge_description(txt, self.descriptions[figure_num])
+
+        self._assemble()
+
+        return self.assembled
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -653,8 +653,7 @@ class RAGFlowPdfParser:
            b_["top"] = b["top"]
            self.boxes.pop(i)

-    def _extract_table_figure(self, need_image, ZM,
-                              return_html, need_position):
+    def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
        tables = {}
        figures = {}
        # extract figure and table boxes
@ -768,9 +767,6 @@ class RAGFlowPdfParser:
                    tk)
            self.boxes.pop(i)

-        res = []
-        positions = []
-
        def cropout(bxs, ltype, poss):
            nonlocal ZM
            pn = set([b["page_number"] - 1 for b in bxs])
@ -818,6 +814,10 @@ class RAGFlowPdfParser:
                height += img.size[1]
            return pic

+        res = []
+        positions = []
+        figure_results = []
+        figure_positions = []
        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join([b["text"] for b in bxs])
@ -825,28 +825,46 @@ class RAGFlowPdfParser:
                continue

            poss = []
-            res.append(
-                (cropout(
-                    bxs,
-                    "figure", poss),
-                 [txt]))
-            positions.append(poss)
+
+            if separate_tables_figures:
+                figure_results.append(
+                    (cropout(
+                        bxs,
+                        "figure", poss),
+                     [txt]))
+                figure_positions.append(poss)
+            else:
+                res.append(
+                    (cropout(
+                        bxs,
+                        "figure", poss),
+                     [txt]))
+                positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
                [(b["bottom"] - b["top"]) / 2 for b in bxs]))
+
            poss = []
+
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

-        assert len(positions) == len(res)
-
-        if need_position:
-            return list(zip(res, positions))
-        return res
+        if separate_tables_figures:
+            assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
+            if need_position:
+                return list(zip(res, positions)), list(zip(figure_results, figure_positions))
+            else:
+                return res, figure_results
+        else:
+            assert len(positions) == len(res)
+            if need_position:
+                return list(zip(res, positions))
+            else:
+                return res

    def proj_match(self, line):
        if len(line) <= 2: