Feat: add VLM-boosted PDF parser (#6278)

### What problem does this PR solve?

Add a VLM-boosted PDF parser, used when a VLM is set.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-03-20 09:39:32 +08:00
committed by GitHub
parent 344727f9ba
commit 1d6760dd84
5 changed files with 181 additions and 33 deletions

View File

@ -0,0 +1,82 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts import vision_llm_figure_describe_prompt
class VisionFigureParser:
    """Describe figures with a vision model and re-attach the descriptions.

    ``figures_data`` is an iterable whose items take one of two forms:

    * ``(figure, description)`` -- no position information;
    * ``((figure, description), positions)`` -- ``positions`` is a
      non-empty list whose entries are 5-element sequences.

    ``description`` is expected to be an iterable of strings (it is joined
    with newlines when the model output is merged in).
    NOTE(review): the figure objects are handed to
    ``picture_vision_llm_chunk`` as ``binary=``; presumably they are images,
    confirm against the caller.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        """Store the model and split ``figures_data`` into parallel lists.

        Args:
            vision_model: forwarded unchanged to ``picture_vision_llm_chunk``.
            figures_data: items in one of the two forms described on the class.
            *args, **kwargs: accepted and ignored.

        Raises:
            AssertionError: if an item is malformed, or if only some of the
                items carry positions.
        """
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    @staticmethod
    def _looks_like_positions(candidate):
        """Return True if ``candidate`` is a non-empty list of 5-element,
        non-string sequences.

        Strings are excluded on purpose: a description list such as
        ``["hello"]`` also has a 5-long first element and must not be
        mistaken for a list of positions.
        """
        if not isinstance(candidate, list) or not candidate:
            return False
        return all(
            not isinstance(entry, (str, bytes)) and len(entry) == 5
            for entry in candidate
        )

    @staticmethod
    def _merge_description(txt, existing):
        """Prepend the model output ``txt`` to an existing description.

        ``existing`` may be an iterable of strings (joined with newlines) or
        an already merged string (used as is, so a repeated call does not
        split it into single characters). No separator is inserted between
        ``txt`` and the existing text; that matches the original behaviour.
        """
        if not isinstance(existing, str):
            existing = "\n".join(existing)
        return txt + existing

    def _extract_figures_info(self, figures_data):
        """Populate ``self.figures``, ``self.descriptions`` and
        ``self.positions`` from ``figures_data``.

        ``self.positions`` stays empty when the items carry no positions.
        """
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            # Positioned form: ((figure, description), positions). Any
            # number of position entries is accepted, not just one.
            if len(item) == 2 and self._looks_like_positions(item[1]):
                img_desc = item[0]
                assert len(img_desc) == 2, "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item, tuple), f"get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Rebuild ``self.assembled`` from the parallel lists and return it.

        Each entry is ``((figure, description), positions)`` when positions
        are present, otherwise the 1-tuple ``((figure, description),)``.
        """
        self.has_positions = len(self.positions) != 0
        pairs = list(zip(self.figures, self.descriptions))

        if self.has_positions:
            self.assembled = list(zip(pairs, self.positions))
        else:
            self.assembled = [(pair,) for pair in pairs]

        return self.assembled

    def __call__(self, **kwargs):
        """Run the vision model on every figure and return the assembled list.

        Keyword Args:
            callback: callable forwarded to ``picture_vision_llm_chunk``;
                defaults to a no-op taking ``(prog, msg)``.

        Returns:
            The list built by ``_assemble``. A description is replaced by a
            merged string only when the model returned non-empty text.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)

        for figure_num, img_binary in enumerate(self.figures or []):
            txt = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )

            if txt:
                self.descriptions[figure_num] = self._merge_description(
                    txt, self.descriptions[figure_num]
                )

        self._assemble()
        return self.assembled

View File

@ -653,8 +653,7 @@ class RAGFlowPdfParser:
b_["top"] = b["top"]
self.boxes.pop(i)
def _extract_table_figure(self, need_image, ZM,
return_html, need_position):
def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
tables = {}
figures = {}
# extract figure and table boxes
@ -768,9 +767,6 @@ class RAGFlowPdfParser:
tk)
self.boxes.pop(i)
res = []
positions = []
def cropout(bxs, ltype, poss):
nonlocal ZM
pn = set([b["page_number"] - 1 for b in bxs])
@ -818,6 +814,10 @@ class RAGFlowPdfParser:
height += img.size[1]
return pic
res = []
positions = []
figure_results = []
figure_positions = []
# crop figure out and add caption
for k, bxs in figures.items():
txt = "\n".join([b["text"] for b in bxs])
@ -825,28 +825,46 @@ class RAGFlowPdfParser:
continue
poss = []
res.append(
(cropout(
bxs,
"figure", poss),
[txt]))
positions.append(poss)
if separate_tables_figures:
figure_results.append(
(cropout(
bxs,
"figure", poss),
[txt]))
figure_positions.append(poss)
else:
res.append(
(cropout(
bxs,
"figure", poss),
[txt]))
positions.append(poss)
for k, bxs in tables.items():
if not bxs:
continue
bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
[(b["bottom"] - b["top"]) / 2 for b in bxs]))
poss = []
res.append((cropout(bxs, "table", poss),
self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
positions.append(poss)
assert len(positions) == len(res)
if need_position:
return list(zip(res, positions))
return res
if separate_tables_figures:
assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
if need_position:
return list(zip(res, positions)), list(zip(figure_results, figure_positions))
else:
return res, figure_results
else:
assert len(positions) == len(res)
if need_position:
return list(zip(res, positions))
else:
return res
def proj_match(self, line):
if len(line) <= 2: