diff --git a/deepdoc/README.md b/deepdoc/README.md index 9a5e44089..db70e30d8 100644 --- a/deepdoc/README.md +++ b/deepdoc/README.md @@ -103,6 +103,31 @@ We use vision information to resolve problems as human beings do.
+ + - **Table Auto-Rotation**. For scanned PDFs where tables may be incorrectly oriented (rotated 90°, 180°, or 270°), + the PDF parser automatically detects the best rotation angle using OCR confidence scores before performing + table structure recognition. This significantly improves OCR accuracy and table structure detection for rotated tables. + + The feature evaluates 4 rotation angles (0°, 90°, 180°, 270°) and selects the one with highest OCR confidence. + After determining the best orientation, it re-performs OCR on the correctly rotated table image. + + This feature is **enabled by default**. You can control it via environment variable: + ```bash + # Disable table auto-rotation + export TABLE_AUTO_ROTATE=false + + # Enable table auto-rotation (default) + export TABLE_AUTO_ROTATE=true + ``` + + Or via API parameter: + ```python + from deepdoc.parser import PdfParser + + parser = PdfParser() + # Disable auto-rotation for this call + boxes, tables = parser(pdf_path, auto_rotate_tables=False) + ``` ## 3. Parser diff --git a/deepdoc/README_zh.md b/deepdoc/README_zh.md index 4ada7edb2..3eb38e3dd 100644 --- a/deepdoc/README_zh.md +++ b/deepdoc/README_zh.md @@ -102,6 +102,30 @@ export HF_ENDPOINT=https://hf-mirror.com
+ + - **表格自动旋转(Table Auto-Rotation)**。对于扫描的 PDF 文档,表格可能存在方向错误(旋转了 90°、180° 或 270°), + PDF 解析器会在进行表格结构识别之前,自动使用 OCR 置信度来检测最佳旋转角度。这大大提高了旋转表格的 OCR 准确性和表格结构检测效果。 + + 该功能会评估 4 个旋转角度(0°、90°、180°、270°),并选择 OCR 置信度最高的角度。 + 确定最佳方向后,会对旋转后的表格图像重新进行 OCR 识别。 + + 此功能**默认启用**。您可以通过环境变量控制: + ```bash + # 禁用表格自动旋转 + export TABLE_AUTO_ROTATE=false + + # 启用表格自动旋转(默认) + export TABLE_AUTO_ROTATE=true + ``` + + 或通过 API 参数控制: + ```python + from deepdoc.parser import PdfParser + + parser = PdfParser() + # 禁用此次调用的自动旋转 + boxes, tables = parser(pdf_path, auto_rotate_tables=False) + ``` ## 3. 解析器 diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 86e44468e..4a7a80976 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -92,6 +92,7 @@ class RAGFlowPdfParser: try: pip_install_torch() import torch.cuda + if torch.cuda.is_available(): self.updown_cnt_mdl.set_param({"device": "cuda"}) except Exception: @@ -196,13 +197,112 @@ class RAGFlowPdfParser: return False return True - def _table_transformer_job(self, ZM): + def _evaluate_table_orientation(self, table_img, sample_ratio=0.3): + """ + Evaluate the best rotation orientation for a table image. + + Tests 4 rotation angles (0°, 90°, 180°, 270°) and uses OCR + confidence scores to determine the best orientation. 
+ + Args: + table_img: PIL Image object of the table region + sample_ratio: Sampling ratio for quick evaluation + + Returns: + tuple: (best_angle, best_img, confidence_scores) + - best_angle: Best rotation angle (0, 90, 180, 270) + - best_img: Image rotated to best orientation + - confidence_scores: Dict of scores for each angle + """ + + rotations = [ + (0, "original"), + (90, "rotate_90"), # clockwise 90° + (180, "rotate_180"), # 180° + (270, "rotate_270"), # clockwise 270° (counter-clockwise 90°) + ] + + results = {} + best_score = -1 + best_angle = 0 + best_img = table_img + + for angle, name in rotations: + # Rotate image + if angle == 0: + rotated_img = table_img + else: + # PIL's rotate is counter-clockwise, use negative angle for clockwise + rotated_img = table_img.rotate(-angle, expand=True) + + # Convert to numpy array for OCR + img_array = np.array(rotated_img) + + # Perform OCR detection and recognition + try: + ocr_results = self.ocr(img_array) + + if ocr_results: + # Calculate average confidence + scores = [conf for _, (_, conf) in ocr_results] + avg_score = sum(scores) / len(scores) if scores else 0 + total_regions = len(scores) + + # Combined score: considers both average confidence and number of regions + # More regions + higher confidence = better orientation + combined_score = avg_score * (1 + 0.1 * min(total_regions, 50) / 50) + else: + avg_score = 0 + total_regions = 0 + combined_score = 0 + + except Exception as e: + logging.warning(f"OCR failed for angle {angle}: {e}") + avg_score = 0 + total_regions = 0 + combined_score = 0 + + results[angle] = {"avg_confidence": avg_score, "total_regions": total_regions, "combined_score": combined_score} + + logging.debug(f"Table orientation {angle}°: avg_conf={avg_score:.4f}, regions={total_regions}, combined={combined_score:.4f}") + + if combined_score > best_score: + best_score = combined_score + best_angle = angle + best_img = rotated_img + + logging.info(f"Best table orientation: {best_angle}° 
(score={best_score:.4f})") + + return best_angle, best_img, results + + def _table_transformer_job(self, ZM, auto_rotate=True): + """ + Process table structure recognition. + + When auto_rotate=True, the complete workflow: + 1. Evaluate table orientation and select the best rotation angle + 2. Use rotated image for table structure recognition (TSR) + 3. Re-OCR the rotated image + 4. Match new OCR results with TSR cell coordinates + + Args: + ZM: Zoom factor + auto_rotate: Whether to enable auto orientation correction + """ logging.debug("Table processing...") imgs, pos = [], [] tbcnt = [0] MARGIN = 10 self.tb_cpns = [] + self.table_rotations = {} # Store rotation info for each table + self.rotated_table_imgs = {} # Store rotated table images + assert len(self.page_layout) == len(self.page_images) + + # Collect layout info for all tables + table_layouts = [] # [(page, table_layout, left, top, right, bott), ...] + + table_index = 0 for p, tbls in enumerate(self.page_layout): # for page tbls = [f for f in tbls if f["type"] == "table"] tbcnt.append(len(tbls)) @@ -214,29 +314,70 @@ class RAGFlowPdfParser: top *= ZM right *= ZM bott *= ZM - pos.append((left, top)) - imgs.append(self.page_images[p].crop((left, top, right, bott))) + pos.append((left, top, p, table_index)) # Add page and table_index + + # Record table layout info + table_layouts.append({"page": p, "table_index": table_index, "layout": tb, "coords": (left, top, right, bott)}) + + # Crop table image + table_img = self.page_images[p].crop((left, top, right, bott)) + + if auto_rotate: + # Evaluate table orientation + logging.debug(f"Evaluating orientation for table {table_index} on page {p}") + best_angle, rotated_img, rotation_scores = self._evaluate_table_orientation(table_img) + + # Store rotation info + self.table_rotations[table_index] = { + "page": p, + "original_pos": (left, top, right, bott), + "best_angle": best_angle, + "scores": rotation_scores, + "rotated_size": rotated_img.size, # (width, height) + 
} + + # Store the rotated image + self.rotated_table_imgs[table_index] = rotated_img + imgs.append(rotated_img) + + if best_angle != 0: + logging.info(f"Table {table_index} on page {p}: rotated {best_angle}° for better recognition") + else: + imgs.append(table_img) + self.table_rotations[table_index] = {"page": p, "original_pos": (left, top, right, bott), "best_angle": 0, "scores": {}, "rotated_size": table_img.size} + self.rotated_table_imgs[table_index] = table_img + + table_index += 1 assert len(self.page_images) == len(tbcnt) - 1 if not imgs: return + + # Perform table structure recognition (TSR) recos = self.tbl_det(imgs) + + # If tables were rotated, re-OCR the rotated images and replace table boxes + if auto_rotate: + self._ocr_rotated_tables(ZM, table_layouts, recos, tbcnt) + + # Process TSR results (keep original logic but handle rotated coordinates) tbcnt = np.cumsum(tbcnt) for i in range(len(tbcnt) - 1): # for page pg = [] for j, tb_items in enumerate(recos[tbcnt[i] : tbcnt[i + 1]]): # for table poss = pos[tbcnt[i] : tbcnt[i + 1]] for it in tb_items: # for table components - it["x0"] = it["x0"] + poss[j][0] - it["x1"] = it["x1"] + poss[j][0] - it["top"] = it["top"] + poss[j][1] - it["bottom"] = it["bottom"] + poss[j][1] - for n in ["x0", "x1", "top", "bottom"]: - it[n] /= ZM - it["top"] += self.page_cum_height[i] - it["bottom"] += self.page_cum_height[i] - it["pn"] = i + # TSR coordinates are relative to rotated image, need to record + it["x0_rotated"] = it["x0"] + it["x1_rotated"] = it["x1"] + it["top_rotated"] = it["top"] + it["bottom_rotated"] = it["bottom"] + + # For rotated tables, coordinate transformation to page space requires rotation + # Since we already re-OCR'd on rotated image, keep simple processing here + it["pn"] = poss[j][2] # page number it["layoutno"] = j + it["table_index"] = poss[j][3] # table index pg.append(it) self.tb_cpns.extend(pg) @@ -249,8 +390,9 @@ class RAGFlowPdfParser: headers = gather(r".*header$") rows = gather(r".* 
(row|header)") spans = gather(r".*spanning") - clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"])) + clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0_rotated"] if "x0_rotated" in x else x["x0"])) clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5) + for b in self.boxes: if b.get("layout_type", "") != "table": continue @@ -282,6 +424,109 @@ class RAGFlowPdfParser: b["H_right"] = spans[ii]["x1"] b["SP"] = ii + def _ocr_rotated_tables(self, ZM, table_layouts, tsr_results, tbcnt): + """ + Re-OCR rotated table images and update self.boxes. + + Args: + ZM: Zoom factor + table_layouts: List of table layout info + tsr_results: TSR recognition results + tbcnt: Cumulative table count per page + """ + tbcnt = np.cumsum(tbcnt) + + for tbl_info in table_layouts: + table_index = tbl_info["table_index"] + page = tbl_info["page"] + layout = tbl_info["layout"] + left, top, right, bott = tbl_info["coords"] + + rotation_info = self.table_rotations.get(table_index, {}) + best_angle = rotation_info.get("best_angle", 0) + + # Get the rotated table image + rotated_img = self.rotated_table_imgs.get(table_index) + if rotated_img is None: + continue + + # If table was rotated, re-OCR the rotated image + if best_angle != 0: + logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°") + + # Perform OCR on rotated image + img_array = np.array(rotated_img) + ocr_results = self.ocr(img_array) + + if not ocr_results: + logging.warning(f"No OCR results for rotated table {table_index}") + continue + + # Remove original text boxes from this table region in self.boxes + # Table region is defined by layout's x0, top, x1, bottom + table_x0 = layout["x0"] + table_top = layout["top"] + table_x1 = layout["x1"] + table_bottom = layout["bottom"] + + # Filter out original boxes within the table region + 
original_box_count = len(self.boxes) + self.boxes = [ + b + for b in self.boxes + if not ( + b.get("page_number") == page + self.page_from + and b.get("layout_type") == "table" + and b["x0"] >= table_x0 - 5 + and b["x1"] <= table_x1 + 5 + and b["top"] >= table_top - 5 + and b["bottom"] <= table_bottom + 5 + ) + ] + removed_count = original_box_count - len(self.boxes) + logging.debug(f"Removed {removed_count} original boxes from table {table_index}") + + # Add new OCR results to self.boxes + # OCR coordinates are relative to rotated image, need to preserve + rotated_width, rotated_height = rotated_img.size + + for bbox, (text, conf) in ocr_results: + if conf < 0.5: # Filter low confidence results + continue + + # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] + x_coords = [p[0] for p in bbox] + y_coords = [p[1] for p in bbox] + + # Coordinates in rotated image + box_x0 = min(x_coords) / ZM + box_x1 = max(x_coords) / ZM + box_top = min(y_coords) / ZM + box_bottom = max(y_coords) / ZM + + # Create new box, mark as from rotated table + new_box = { + "text": text, + "x0": box_x0 + table_x0, # Coordinates relative to page + "x1": box_x1 + table_x0, + "top": box_top + table_top + self.page_cum_height[page], + "bottom": box_bottom + table_top + self.page_cum_height[page], + "page_number": page + self.page_from, + "layout_type": "table", + "layoutno": f"table-{table_index}", + "_rotated": True, + "_rotation_angle": best_angle, + "_table_index": table_index, + # Save original coordinates in rotated image for table reconstruction + "_rotated_x0": box_x0, + "_rotated_x1": box_x1, + "_rotated_top": box_top, + "_rotated_bottom": box_bottom, + } + self.boxes.append(new_box) + + logging.info(f"Added {len(ocr_results)} OCR results from rotated table {table_index}") + def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None): start = timer() bxs = self.ocr.detect(np.array(img), device_id) @@ -412,11 +657,9 @@ class RAGFlowPdfParser: page_cols[pg] = best_k 
logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}") - global_cols = Counter(page_cols.values()).most_common(1)[0][0] logging.info(f"Global column_num decided by majority: {global_cols}") - for pg, bxs in by_page.items(): if not bxs: continue @@ -1184,10 +1427,26 @@ class RAGFlowPdfParser: if len(self.boxes) == 0 and zoomin < 9: self.__images__(fnm, zoomin * 3, page_from, page_to, callback) - def __call__(self, fnm, need_image=True, zoomin=3, return_html=False): + def __call__(self, fnm, need_image=True, zoomin=3, return_html=False, auto_rotate_tables=None): + """ + Parse a PDF file. + + Args: + fnm: PDF file path or binary content + need_image: Whether to extract images + zoomin: Zoom factor + return_html: Whether to return tables in HTML format + auto_rotate_tables: Whether to enable auto orientation correction for tables. + None: Use TABLE_AUTO_ROTATE env var setting (default: True) + True: Enable auto orientation correction + False: Disable auto orientation correction + """ + if auto_rotate_tables is None: + auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes") + self.__images__(fnm, zoomin) self._layouts_rec(zoomin) - self._table_transformer_job(zoomin) + self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables) self._text_merge() self._concat_downward() self._filter_forpages() @@ -1205,8 +1464,11 @@ class RAGFlowPdfParser: if callback: callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start)) + # Read table auto-rotation setting from environment variable + auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes") + start = timer() - self._table_transformer_job(zoomin) + self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables) if callback: callback(0.83, "Table analysis ({:.2f}s)".format(timer() - start)) @@ -1498,10 +1760,7 @@ class VisionParser(RAGFlowPdfParser): if text: width, height = self.page_images[idx].size - all_docs.append(( - 
text, - f"@@{pdf_page_num + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##" - )) + all_docs.append((text, f"@@{pdf_page_num + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##")) return all_docs, []