mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-03 09:05:07 +08:00
Fix: pdf page_number error (#12938)
### What problem does this PR solve? Fix: pdf page_number error #12937 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -1103,7 +1103,30 @@ class RAGFlowPdfParser:
|
|||||||
|
|
||||||
def cropout(bxs, ltype, poss):
|
def cropout(bxs, ltype, poss):
|
||||||
nonlocal ZM
|
nonlocal ZM
|
||||||
pn = set([b["page_number"] - 1 for b in bxs])
|
max_page_index = len(self.page_images) - 1
|
||||||
|
|
||||||
|
def local_page_index(page_number):
|
||||||
|
idx = page_number - 1
|
||||||
|
if idx > max_page_index and self.page_from:
|
||||||
|
idx = page_number - 1 - self.page_from
|
||||||
|
return idx
|
||||||
|
|
||||||
|
pn = set()
|
||||||
|
for b in bxs:
|
||||||
|
idx = local_page_index(b["page_number"])
|
||||||
|
if 0 <= idx <= max_page_index:
|
||||||
|
pn.add(idx)
|
||||||
|
else:
|
||||||
|
logging.warning(
|
||||||
|
"Skip out-of-range page_number %s (page_from=%s, pages=%s)",
|
||||||
|
b.get("page_number"),
|
||||||
|
self.page_from,
|
||||||
|
len(self.page_images),
|
||||||
|
)
|
||||||
|
|
||||||
|
if not pn:
|
||||||
|
return None
|
||||||
|
|
||||||
if len(pn) < 2:
|
if len(pn) < 2:
|
||||||
pn = list(pn)[0]
|
pn = list(pn)[0]
|
||||||
ht = self.page_cum_height[pn]
|
ht = self.page_cum_height[pn]
|
||||||
@ -1122,12 +1145,16 @@ class RAGFlowPdfParser:
|
|||||||
return self.page_images[pn].crop((left * ZM, top * ZM, right * ZM, bott * ZM))
|
return self.page_images[pn].crop((left * ZM, top * ZM, right * ZM, bott * ZM))
|
||||||
pn = {}
|
pn = {}
|
||||||
for b in bxs:
|
for b in bxs:
|
||||||
p = b["page_number"] - 1
|
p = local_page_index(b["page_number"])
|
||||||
if p not in pn:
|
if 0 <= p <= max_page_index:
|
||||||
pn[p] = []
|
if p not in pn:
|
||||||
pn[p].append(b)
|
pn[p] = []
|
||||||
|
pn[p].append(b)
|
||||||
pn = sorted(pn.items(), key=lambda x: x[0])
|
pn = sorted(pn.items(), key=lambda x: x[0])
|
||||||
imgs = [cropout(arr, ltype, poss) for p, arr in pn]
|
imgs = [cropout(arr, ltype, poss) for p, arr in pn]
|
||||||
|
imgs = [img for img in imgs if img is not None]
|
||||||
|
if not imgs:
|
||||||
|
return None
|
||||||
pic = Image.new("RGB", (int(np.max([i.size[0] for i in imgs])), int(np.sum([m.size[1] for m in imgs]))), (245, 245, 245))
|
pic = Image.new("RGB", (int(np.max([i.size[0] for i in imgs])), int(np.sum([m.size[1] for m in imgs]))), (245, 245, 245))
|
||||||
height = 0
|
height = 0
|
||||||
for img in imgs:
|
for img in imgs:
|
||||||
@ -1148,10 +1175,16 @@ class RAGFlowPdfParser:
|
|||||||
poss = []
|
poss = []
|
||||||
|
|
||||||
if separate_tables_figures:
|
if separate_tables_figures:
|
||||||
figure_results.append((cropout(bxs, "figure", poss), [txt]))
|
img = cropout(bxs, "figure", poss)
|
||||||
|
if img is None:
|
||||||
|
continue
|
||||||
|
figure_results.append((img, [txt]))
|
||||||
figure_positions.append(poss)
|
figure_positions.append(poss)
|
||||||
else:
|
else:
|
||||||
res.append((cropout(bxs, "figure", poss), [txt]))
|
img = cropout(bxs, "figure", poss)
|
||||||
|
if img is None:
|
||||||
|
continue
|
||||||
|
res.append((img, [txt]))
|
||||||
positions.append(poss)
|
positions.append(poss)
|
||||||
|
|
||||||
for k, bxs in tables.items():
|
for k, bxs in tables.items():
|
||||||
@ -1161,7 +1194,10 @@ class RAGFlowPdfParser:
|
|||||||
|
|
||||||
poss = []
|
poss = []
|
||||||
|
|
||||||
res.append((cropout(bxs, "table", poss), self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
|
img = cropout(bxs, "table", poss)
|
||||||
|
if img is None:
|
||||||
|
continue
|
||||||
|
res.append((img, self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
|
||||||
positions.append(poss)
|
positions.append(poss)
|
||||||
|
|
||||||
if separate_tables_figures:
|
if separate_tables_figures:
|
||||||
|
|||||||
Reference in New Issue
Block a user