Fix: crop index may out of range (#11341)

### What problem does this PR solve?

Crop index may out of range. #11323


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-11-18 17:01:54 +08:00
committed by GitHub
parent 341e5904c8
commit c2b7c305fa
2 changed files with 111 additions and 6 deletions

View File

@ -338,12 +338,54 @@ class MinerUParser(RAGFlowPdfParser):
return None, None
return
if not getattr(self, "page_images", None):
self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
if need_position:
return None, None
return
page_count = len(self.page_images)
filtered_poss = []
for pns, left, right, top, bottom in poss:
if not pns:
self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
continue
valid_pns = [p for p in pns if 0 <= p < page_count]
if not valid_pns:
self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
continue
filtered_poss.append((valid_pns, left, right, top, bottom))
poss = filtered_poss
if not poss:
self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
first_page_idx = pos[0][0]
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count):
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position:
return None, None
return
last_page_height = self.page_images[last_page_idx].size[1]
poss.append(
(
[last_page_idx],
pos[1],
pos[2],
min(last_page_height, pos[4] + GAP),
min(last_page_height, pos[4] + 120),
)
)
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
@ -353,7 +395,14 @@ class MinerUParser(RAGFlowPdfParser):
bottom = top + 2
for pn in pns[1:]:
bottom += self.page_images[pn - 1].size[1]
if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1]
else:
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count):
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
@ -364,6 +413,9 @@ class MinerUParser(RAGFlowPdfParser):
bottom -= img0.size[1]
for pn in pns[1:]:
if not (0 <= pn < page_count):
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))

View File

@ -1252,24 +1252,77 @@ class RAGFlowPdfParser:
return None, None
return
if not getattr(self, "page_images", None):
logging.warning("crop called without page images; skipping image generation.")
if need_position:
return None, None
return
page_count = len(self.page_images)
filtered_poss = []
for pns, left, right, top, bottom in poss:
if not pns:
logging.warning("Empty page index list in crop; skipping this position.")
continue
valid_pns = [p for p in pns if 0 <= p < page_count]
if not valid_pns:
logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
continue
filtered_poss.append((valid_pns, left, right, top, bottom))
poss = filtered_poss
if not poss:
logging.warning("No valid positions after filtering; skip cropping.")
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
first_page_idx = pos[0][0]
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count):
logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position:
return None, None
return
last_page_height = self.page_images[last_page_idx].size[1] / ZM
poss.append(
(
[last_page_idx],
pos[1],
pos[2],
min(last_page_height, pos[4] + GAP),
min(last_page_height, pos[4] + 120),
)
)
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
bottom *= ZM
for pn in pns[1:]:
bottom += self.page_images[pn - 1].size[1]
if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1]
else:
logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count):
logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue
imgs.append(self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min(bottom, self.page_images[pns[0]].size[1]))))
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, left, right, top, min(bottom, self.page_images[pns[0]].size[1]) / ZM))
bottom -= self.page_images[pns[0]].size[1]
for pn in pns[1:]:
if not (0 <= pn < page_count):
logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue
imgs.append(self.page_images[pn].crop((left * ZM, 0, right * ZM, min(bottom, self.page_images[pn].size[1]))))
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, left, right, 0, min(bottom, self.page_images[pn].size[1]) / ZM))