From c2b7c305fa8f1b8cc199dc4e470a3472221bc7e5 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Tue, 18 Nov 2025 17:01:54 +0800
Subject: [PATCH] Fix: crop index may out of range (#11341)

### What problem does this PR solve?

Crop index may be out of range. #11323

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/mineru_parser.py | 58 ++++++++++++++++++++++++++++++--
 deepdoc/parser/pdf_parser.py    | 59 +++++++++++++++++++++++++++++++--
 2 files changed, 111 insertions(+), 6 deletions(-)

diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index 99b56e83a..e3cb62cc7 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -338,12 +338,54 @@ class MinerUParser(RAGFlowPdfParser):
                 return None, None
             return
 
+        if not getattr(self, "page_images", None):
+            self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
         max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
-        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
         pos = poss[-1]
-        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1], pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1], pos[4] + 120)))
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1]
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
 
         positions = []
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
@@ -353,7 +395,14 @@ class MinerUParser(RAGFlowPdfParser):
                 bottom = top + 2
 
             for pn in pns[1:]:
-                bottom += self.page_images[pn - 1].size[1]
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
 
             img0 = self.page_images[pns[0]]
             x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
@@ -364,6 +413,9 @@ class MinerUParser(RAGFlowPdfParser):
             bottom -= img0.size[1]
 
             for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
                 page = self.page_images[pn]
                 x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
                 cimgp = page.crop((x0, y0, x1, y1))
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 6550c49cd..5bc877a6a 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -1252,24 +1252,77 @@ class RAGFlowPdfParser:
                 return None, None
             return
 
+        if not getattr(self, "page_images", None):
+            logging.warning("crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                logging.warning("Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                logging.warning(f"All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            logging.warning("No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
         max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
         GAP = 6
         pos = poss[0]
-        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
         pos = poss[-1]
-        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + GAP), min(self.page_images[pos[0][-1]].size[1] / ZM, pos[4] + 120)))
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            logging.warning(f"Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1] / ZM
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
 
         positions = []
         for ii, (pns, left, right, top, bottom) in enumerate(poss):
             right = left + max_width
             bottom *= ZM
             for pn in pns[1:]:
-                bottom += self.page_images[pn - 1].size[1]
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    logging.warning(f"Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                logging.warning(f"Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
+
             imgs.append(self.page_images[pns[0]].crop((left * ZM, top * ZM, right * ZM, min(bottom, self.page_images[pns[0]].size[1]))))
             if 0 < ii < len(poss) - 1:
                 positions.append((pns[0] + self.page_from, left, right, top, min(bottom, self.page_images[pns[0]].size[1]) / ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    logging.warning(f"Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
                 imgs.append(self.page_images[pn].crop((left * ZM, 0, right * ZM, min(bottom, self.page_images[pn].size[1]))))
                 if 0 < ii < len(poss) - 1:
                     positions.append((pn + self.page_from, left, right, 0, min(bottom, self.page_images[pn].size[1]) / ZM))