mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add VLM-boosted PDF parser (#6278)
### What problem does this PR solve? Add VLM-boosted PDF parser if VLM is set. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
82
deepdoc/parser/figure_parser.py
Normal file
82
deepdoc/parser/figure_parser.py
Normal file
@ -0,0 +1,82 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
|
||||
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
|
||||
from rag.prompts import vision_llm_figure_describe_prompt
|
||||
|
||||
|
||||
class VisionFigureParser:
    """Enrich extracted figures with descriptions produced by a vision model.

    ``figures_data`` is an iterable whose items take one of two shapes:

    * ``((figure, description), positions)`` -- ``positions`` is a
      one-element list whose single entry is a 5-item, non-string sequence;
    * ``(figure, description)`` -- a plain 2-tuple without positions.

    All items are expected to use the same shape; this is asserted in
    ``__init__``. Calling the instance asks the vision model for a
    description of every figure and returns the re-assembled entries.
    """

    def __init__(self, vision_model, figures_data, *args, **kwargs):
        """Store the model and split ``figures_data`` into parallel lists.

        Args:
            vision_model: Model object forwarded unchanged to
                ``picture_vision_llm_chunk``.
            figures_data: Iterable of figure entries (see class docstring).
            *args, **kwargs: Accepted and ignored.

        Raises:
            AssertionError: If an entry is malformed, or if only some of the
                entries carry positions.
        """
        self.vision_model = vision_model
        self._extract_figures_info(figures_data)
        assert len(self.figures) == len(self.descriptions)
        assert not self.positions or (len(self.figures) == len(self.positions))

    @staticmethod
    def _has_positions(item):
        """Return True when ``item`` looks like ``((figure, desc), [position])``.

        The single position entry must be a 5-item sequence that is *not* a
        string; otherwise a position-less ``(figure, ["Fig 1"])`` whose
        caption happens to be five characters long would be misclassified.
        """
        if len(item) != 2 or not isinstance(item[1], list) or len(item[1]) != 1:
            return False
        entry = item[1][0]
        return not isinstance(entry, str) and len(entry) == 5

    @staticmethod
    def _merge_description(vlm_text, existing):
        """Combine the model's text with the previously stored description.

        ``existing`` may be a list of caption strings or an already merged
        string. Empty captions are dropped so that no dangling separator is
        produced; the remaining parts are joined with newlines.
        """
        captions = [existing] if isinstance(existing, str) else list(existing)
        return "\n".join([vlm_text, *(c for c in captions if c)])

    def _extract_figures_info(self, figures_data):
        """Populate ``figures``, ``descriptions`` and ``positions``."""
        self.figures = []
        self.descriptions = []
        self.positions = []

        for item in figures_data:
            if self._has_positions(item):
                img_desc = item[0]
                assert len(img_desc) == 2, "Should be (figure, [description])"
                self.figures.append(img_desc[0])
                self.descriptions.append(img_desc[1])
                self.positions.append(item[1])
            else:
                assert len(item) == 2 and isinstance(item, tuple), f"get {len(item)=}, {item=}"
                self.figures.append(item[0])
                self.descriptions.append(item[1])

    def _assemble(self):
        """Rebuild the entry list from the parallel attribute lists.

        Returns:
            ``self.assembled``: a list of ``((figure, description), positions)``
            tuples when positions were supplied, otherwise a list of 1-tuples
            ``((figure, description),)``.
        """
        self.assembled = []
        self.has_positions = len(self.positions) != 0

        for i, figure in enumerate(self.figures):
            figure_desc = (figure, self.descriptions[i])
            pos = self.positions[i] if self.has_positions else None

            if pos is not None:
                self.assembled.append((figure_desc, pos))
            else:
                # Shape kept as a 1-tuple for backward compatibility.
                self.assembled.append((figure_desc,))

        return self.assembled

    def __call__(self, **kwargs):
        """Describe every figure with the vision model and assemble results.

        Keyword Args:
            callback: Optional callable taking ``(prog, msg)``; forwarded to
                ``picture_vision_llm_chunk``. Defaults to a no-op.

        Returns:
            The list built by ``_assemble``. A figure for which the model
            returned nothing keeps its original description untouched;
            otherwise the description becomes a single string made of the
            model's text followed by the non-empty original captions.
        """
        callback = kwargs.get("callback", lambda prog, msg: None)

        for idx, img_binary in enumerate(self.figures or []):
            txt = picture_vision_llm_chunk(
                binary=img_binary,
                vision_model=self.vision_model,
                prompt=vision_llm_figure_describe_prompt(),
                callback=callback,
            )

            if txt:
                self.descriptions[idx] = self._merge_description(txt, self.descriptions[idx])

        self._assemble()

        return self.assembled
|
||||
@ -653,8 +653,7 @@ class RAGFlowPdfParser:
|
||||
b_["top"] = b["top"]
|
||||
self.boxes.pop(i)
|
||||
|
||||
def _extract_table_figure(self, need_image, ZM,
|
||||
return_html, need_position):
|
||||
def _extract_table_figure(self, need_image, ZM, return_html, need_position, separate_tables_figures=False):
|
||||
tables = {}
|
||||
figures = {}
|
||||
# extract figure and table boxes
|
||||
@ -768,9 +767,6 @@ class RAGFlowPdfParser:
|
||||
tk)
|
||||
self.boxes.pop(i)
|
||||
|
||||
res = []
|
||||
positions = []
|
||||
|
||||
def cropout(bxs, ltype, poss):
|
||||
nonlocal ZM
|
||||
pn = set([b["page_number"] - 1 for b in bxs])
|
||||
@ -818,6 +814,10 @@ class RAGFlowPdfParser:
|
||||
height += img.size[1]
|
||||
return pic
|
||||
|
||||
res = []
|
||||
positions = []
|
||||
figure_results = []
|
||||
figure_positions = []
|
||||
# crop figure out and add caption
|
||||
for k, bxs in figures.items():
|
||||
txt = "\n".join([b["text"] for b in bxs])
|
||||
@ -825,28 +825,46 @@ class RAGFlowPdfParser:
|
||||
continue
|
||||
|
||||
poss = []
|
||||
res.append(
|
||||
(cropout(
|
||||
bxs,
|
||||
"figure", poss),
|
||||
[txt]))
|
||||
positions.append(poss)
|
||||
|
||||
if separate_tables_figures:
|
||||
figure_results.append(
|
||||
(cropout(
|
||||
bxs,
|
||||
"figure", poss),
|
||||
[txt]))
|
||||
figure_positions.append(poss)
|
||||
else:
|
||||
res.append(
|
||||
(cropout(
|
||||
bxs,
|
||||
"figure", poss),
|
||||
[txt]))
|
||||
positions.append(poss)
|
||||
|
||||
for k, bxs in tables.items():
|
||||
if not bxs:
|
||||
continue
|
||||
bxs = Recognizer.sort_Y_firstly(bxs, np.mean(
|
||||
[(b["bottom"] - b["top"]) / 2 for b in bxs]))
|
||||
|
||||
poss = []
|
||||
|
||||
res.append((cropout(bxs, "table", poss),
|
||||
self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
|
||||
positions.append(poss)
|
||||
|
||||
assert len(positions) == len(res)
|
||||
|
||||
if need_position:
|
||||
return list(zip(res, positions))
|
||||
return res
|
||||
if separate_tables_figures:
|
||||
assert len(positions) + len(figure_positions) == len(res) + len(figure_results)
|
||||
if need_position:
|
||||
return list(zip(res, positions)), list(zip(figure_results, figure_positions))
|
||||
else:
|
||||
return res, figure_results
|
||||
else:
|
||||
assert len(positions) == len(res)
|
||||
if need_position:
|
||||
return list(zip(res, positions))
|
||||
else:
|
||||
return res
|
||||
|
||||
def proj_match(self, line):
|
||||
if len(line) <= 2:
|
||||
|
||||
Reference in New Issue
Block a user