mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: crop index may be out of range (#11341)
### What problem does this PR solve?

The crop index may be out of range. #11323

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -338,12 +338,54 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
return None, None
|
||||
return
|
||||
|
||||
if not getattr(self, "page_images", None):
|
||||
self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
page_count = len(self.page_images)
|
||||
|
||||
filtered_poss = []
|
||||
for pns, left, right, top, bottom in poss:
|
||||
if not pns:
|
||||
self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
|
||||
continue
|
||||
valid_pns = [p for p in pns if 0 <= p < page_count]
|
||||
if not valid_pns:
|
||||
self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
|
||||
continue
|
||||
filtered_poss.append((valid_pns, left, right, top, bottom))
|
||||
|
||||
poss = filtered_poss
|
||||
if not poss:
|
||||
self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||||
GAP = 6
|
||||
pos = poss[0]
|
||||
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
first_page_idx = pos[0][0]
|
||||
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
pos = poss[-1]
|
||||
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
|
||||
last_page_idx = pos[0][-1]
|
||||
if not (0 <= last_page_idx < page_count):
|
||||
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
last_page_height = self.page_images[last_page_idx].size[1]
|
||||
poss.append(
|
||||
(
|
||||
[last_page_idx],
|
||||
pos[1],
|
||||
pos[2],
|
||||
min(last_page_height, pos[4] + GAP),
|
||||
min(last_page_height, pos[4] + 120),
|
||||
)
|
||||
)
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
@ -353,7 +395,14 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
bottom = top + 2
|
||||
|
||||
for pn in pns[1:]:
|
||||
if 0 <= pn - 1 < page_count:
|
||||
bottom += self.page_images[pn - 1].size[1]
|
||||
else:
|
||||
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
|
||||
|
||||
if not (0 <= pns[0] < page_count):
|
||||
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
|
||||
continue
|
||||
|
||||
img0 = self.page_images[pns[0]]
|
||||
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
|
||||
@ -364,6 +413,9 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
|
||||
bottom -= img0.size[1]
|
||||
for pn in pns[1:]:
|
||||
if not (0 <= pn < page_count):
|
||||
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
|
||||
continue
|
||||
page = self.page_images[pn]
|
||||
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
|
||||
cimgp = page.crop((x0, y0, x1, y1))
|
||||
|
||||
@ -1252,24 +1252,77 @@ class RAGFlowPdfParser:
|
||||
return None, None
|
||||
return
|
||||
|
||||
if not getattr(self, "page_images", None):
|
||||
logging.warning("crop called without page images; skipping image generation.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
page_count = len(self.page_images)
|
||||
|
||||
filtered_poss = []
|
||||
for pns, left, right, top, bottom in poss:
|
||||
if not pns:
|
||||
logging.warning("Empty page index list in crop; skipping this position.")
|
||||
continue
|
||||
valid_pns = [p for p in pns if 0 <= p < page_count]
|
||||
if not valid_pns:
|
||||
logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
|
||||
continue
|
||||
filtered_poss.append((valid_pns, left, right, top, bottom))
|
||||
|
||||
poss = filtered_poss
|
||||
if not poss:
|
||||
logging.warning("No valid positions after filtering; skip cropping.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
|
||||
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
|
||||
GAP = 6
|
||||
pos = poss[0]
|
||||
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
first_page_idx = pos[0][0]
|
||||
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
|
||||
pos = poss[-1]
|
||||
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
|
||||
last_page_idx = pos[0][-1]
|
||||
if not (0 <= last_page_idx < page_count):
|
||||
logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
|
||||
if need_position:
|
||||
return None, None
|
||||
return
|
||||
last_page_height = self.page_images[last_page_idx].size[1] / ZM
|
||||
poss.append(
|
||||
(
|
||||
[last_page_idx],
|
||||
pos[1],
|
||||
pos[2],
|
||||
min(last_page_height, pos[4] + GAP),
|
||||
min(last_page_height, pos[4] + 120),
|
||||
)
|
||||
)
|
||||
|
||||
positions = []
|
||||
for ii, (pns, left, right, top, bottom) in enumerate(poss):
|
||||
right = left + max_width
|
||||
bottom *= ZM
|
||||
for pn in pns[1:]:
|
||||
if 0 <= pn - 1 < page_count:
|
||||
bottom += self.page_images[pn - 1].size[1]
|
||||
else:
|
||||
logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
|
||||
|
||||
if not (0 <= pns[0] < page_count):
|
||||
logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
|
||||
continue
|
||||
|
||||
imgs.append(self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min(bottom, self.page_images[pns[0]].size[1]))))
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pns[0] + self.page_from, left, right, top, min(bottom, self.page_images[pns[0]].size[1]) / ZM))
|
||||
bottom -= self.page_images[pns[0]].size[1]
|
||||
for pn in pns[1:]:
|
||||
if not (0 <= pn < page_count):
|
||||
logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
|
||||
continue
|
||||
imgs.append(self.page_images[pn].crop((left * ZM, 0, right * ZM, min(bottom, self.page_images[pn].size[1]))))
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pn + self.page_from, left, right, 0, min(bottom, self.page_images[pn].size[1]) / ZM))
|
||||
|
||||
Reference in New Issue
Block a user