# Fix: crop index may out of range (#11341)

### What problem does this PR solve?

Crop index may be out of range. #11323

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
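The fix applies one pattern in both parsers: validate every page index against `len(self.page_images)` before it is used to index the rendered pages, and log and skip anything out of range instead of raising `IndexError`. Below is a minimal standalone sketch of that pattern; the `filter_positions` helper and the sample data are illustrative, not part of the PR.

```python
# Illustrative sketch of the guard pattern this fix applies: drop page
# indices outside [0, page_count) before any cropping happens.
# `filter_positions` is a hypothetical helper, not RAGFlow code.

def filter_positions(poss, page_count):
    filtered = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            continue  # empty page index list: nothing to crop
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            continue  # every index out of range: skip instead of raising
        filtered.append((valid_pns, left, right, top, bottom))
    return filtered


# A position pointing at page 5 of a 2-page document is dropped instead
# of crashing crop() with an IndexError later on.
poss = [([0], 10, 200, 30, 60), ([5], 10, 200, 30, 60)]
print(filter_positions(poss, page_count=2))  # -> [([0], 10, 200, 30, 60)]
```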
Changes to `MinerUParser`:

```diff
@@ -338,12 +338,54 @@ class MinerUParser(RAGFlowPdfParser):
                 return None, None
             return
 
+        if not getattr(self, "page_images", None):
+            self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
         max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
-        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
         pos = poss[-1]
-        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1]
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
+
         positions = []
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
@@ -353,7 +395,14 @@ class MinerUParser(RAGFlowPdfParser):
                 bottom = top + 2
 
             for pn in pns[1:]:
-                bottom += self.page_images[pn - 1].size[1]
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
 
             img0 = self.page_images[pns[0]]
             x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
@@ -364,6 +413,9 @@ class MinerUParser(RAGFlowPdfParser):
 
             bottom -= img0.size[1]
             for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
                 page = self.page_images[pn]
                 x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
                 cimgp = page.crop((x0, y0, x1, y1))
```
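When a segment spans several pages, `crop` extends `bottom` by the heights of the intermediate pages; the guard above only accumulates heights for indices that actually exist. A small worked example of that accumulation (the page heights are invented; the real code reads them from `self.page_images[pn].size[1]`):

```python
# Worked example of the guarded height accumulation in the hunk above.
# Page heights are made up for illustration.

page_heights = [1000, 1000, 1000]          # three rendered pages
page_count = len(page_heights)

pns = [0, 1, 2]                            # segment spans pages 0..2
bottom = 350                               # bottom edge on the first page

for pn in pns[1:]:
    if 0 <= pn - 1 < page_count:           # the guard added by this PR
        bottom += page_heights[pn - 1]

print(bottom)  # 2350 = 350 + 1000 + 1000, measured from page 0's top
```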
Changes to `RAGFlowPdfParser`:

```diff
@@ -1252,24 +1252,77 @@ class RAGFlowPdfParser:
                 return None, None
             return
 
+        if not getattr(self, "page_images", None):
+            logging.warning("crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                logging.warning("Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            logging.warning("No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
         max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
-        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
         pos = poss[-1]
-        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1] / ZM
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
+
         positions = []
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
             right = left + max_width
             bottom *= ZM
             for pn in pns[1:]:
-                bottom += self.page_images[pn - 1].size[1]
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
+
             imgs.append(self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min(bottom, self.page_images[pns[0]].size[1]))))
             if 0 < ii < len(poss) - 1:
                 positions.append((pns[0] + self.page_from, left, right, top, min(bottom, self.page_images[pns[0]].size[1]) / ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
                 imgs.append(self.page_images[pn].crop((left * ZM, 0, right * ZM, min(bottom, self.page_images[pn].size[1]))))
                 if 0 < ii < len(poss) - 1:
                     positions.append((pn + self.page_from, left, right, 0, min(bottom, self.page_images[pn].size[1]) / ZM))
```
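The base-class variant differs from the MinerU one mainly in its coordinate space: positions are kept in layout units and scaled by the render zoom factor `ZM` when cropping the rasterized pages, which is why this hunk divides page heights by `ZM` and multiplies crop coordinates by it. A toy illustration of that scaling; the `ZM` value here is assumed, not taken from the PR:

```python
# Toy illustration of the ZM scaling in the RAGFlowPdfParser hunk:
# crop boxes are stored in layout units and multiplied by the zoom
# factor before indexing into the rendered page image.
# ZM = 3 is an assumed value for this sketch.

ZM = 3
left, top, right, bottom = 40, 100, 240, 180      # layout units
crop_box = (left * ZM, top * ZM, right * ZM, bottom * ZM)
print(crop_box)  # (120, 300, 720, 540) -- pixel coordinates
```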