mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-23 11:36:38 +08:00
[Feat]Automatic table orientation detection and correction (#12719)
### What problem does this PR solve? This PR introduces automatic table orientation detection and correction within the PDF parser. This ensures that tables in PDFs are correctly oriented before structure recognition, improving overall parsing accuracy. ### Type of change - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update
This commit is contained in:
@ -103,6 +103,31 @@ We use vision information to resolve problems as human being.
|
||||
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
|
||||
</div>
|
||||
|
||||
- **Table Auto-Rotation**. For scanned PDFs where tables may be incorrectly oriented (rotated 90°, 180°, or 270°),
|
||||
the PDF parser automatically detects the best rotation angle using OCR confidence scores before performing
|
||||
table structure recognition. This significantly improves OCR accuracy and table structure detection for rotated tables.
|
||||
|
||||
The feature evaluates 4 rotation angles (0°, 90°, 180°, 270°) and selects the one with the highest OCR confidence.
|
||||
After determining the best orientation, it re-runs OCR on the correctly rotated table image.
|
||||
|
||||
This feature is **enabled by default**. You can control it via an environment variable:
|
||||
```bash
|
||||
# Disable table auto-rotation
|
||||
export TABLE_AUTO_ROTATE=false
|
||||
|
||||
# Enable table auto-rotation (default)
|
||||
export TABLE_AUTO_ROTATE=true
|
||||
```
|
||||
|
||||
Or per call via the `auto_rotate_tables` API parameter (when given, it takes precedence over the environment variable):
|
||||
```python
|
||||
from deepdoc.parser import PdfParser
|
||||
|
||||
parser = PdfParser()
|
||||
# Disable auto-rotation for this call
|
||||
boxes, tables = parser(pdf_path, auto_rotate_tables=False)
|
||||
```
|
||||
|
||||
<a name="3"></a>
|
||||
## 3. Parser
|
||||
|
||||
@ -102,6 +102,30 @@ export HF_ENDPOINT=https://hf-mirror.com
|
||||
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
||||
<img src="https://github.com/infiniflow/ragflow/assets/12318111/cb24e81b-f2ba-49f3-ac09-883d75606f4c" width="1000"/>
|
||||
</div>
|
||||
|
||||
- **表格自动旋转(Table Auto-Rotation)**。对于扫描的 PDF 文档,表格可能存在方向错误(旋转了 90°、180° 或 270°),
|
||||
PDF 解析器会在进行表格结构识别之前,自动使用 OCR 置信度来检测最佳旋转角度。这大大提高了旋转表格的 OCR 准确性和表格结构检测效果。
|
||||
|
||||
该功能会评估 4 个旋转角度(0°、90°、180°、270°),并选择 OCR 置信度最高的角度。
|
||||
确定最佳方向后,会对旋转后的表格图像重新进行 OCR 识别。
|
||||
|
||||
此功能**默认启用**。您可以通过环境变量控制:
|
||||
```bash
|
||||
# 禁用表格自动旋转
|
||||
export TABLE_AUTO_ROTATE=false
|
||||
|
||||
# 启用表格自动旋转(默认)
|
||||
export TABLE_AUTO_ROTATE=true
|
||||
```
|
||||
|
||||
或通过 API 参数控制:
|
||||
```python
|
||||
from deepdoc.parser import PdfParser
|
||||
|
||||
parser = PdfParser()
|
||||
# 禁用此次调用的自动旋转
|
||||
boxes, tables = parser(pdf_path, auto_rotate_tables=False)
|
||||
```
|
||||
|
||||
<a name="3"></a>
|
||||
## 3. 解析器
|
||||
|
||||
@ -92,6 +92,7 @@ class RAGFlowPdfParser:
|
||||
try:
|
||||
pip_install_torch()
|
||||
import torch.cuda
|
||||
|
||||
if torch.cuda.is_available():
|
||||
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
||||
except Exception:
|
||||
@ -196,13 +197,112 @@ class RAGFlowPdfParser:
|
||||
return False
|
||||
return True
|
||||
|
||||
def _table_transformer_job(self, ZM):
|
||||
def _evaluate_table_orientation(self, table_img, sample_ratio=0.3):
|
||||
"""
|
||||
Evaluate the best rotation orientation for a table image.
|
||||
|
||||
Tests 4 rotation angles (0°, 90°, 180°, 270°) and uses OCR
|
||||
confidence scores to determine the best orientation.
|
||||
|
||||
Args:
|
||||
table_img: PIL Image object of the table region
|
||||
sample_ratio: Sampling ratio for quick evaluation
|
||||
|
||||
Returns:
|
||||
tuple: (best_angle, best_img, confidence_scores)
|
||||
- best_angle: Best rotation angle (0, 90, 180, 270)
|
||||
- best_img: Image rotated to best orientation
|
||||
- confidence_scores: Dict of scores for each angle
|
||||
"""
|
||||
|
||||
rotations = [
|
||||
(0, "original"),
|
||||
(90, "rotate_90"), # clockwise 90°
|
||||
(180, "rotate_180"), # 180°
|
||||
(270, "rotate_270"), # clockwise 270° (counter-clockwise 90°)
|
||||
]
|
||||
|
||||
results = {}
|
||||
best_score = -1
|
||||
best_angle = 0
|
||||
best_img = table_img
|
||||
|
||||
for angle, name in rotations:
|
||||
# Rotate image
|
||||
if angle == 0:
|
||||
rotated_img = table_img
|
||||
else:
|
||||
# PIL's rotate is counter-clockwise, use negative angle for clockwise
|
||||
rotated_img = table_img.rotate(-angle, expand=True)
|
||||
|
||||
# Convert to numpy array for OCR
|
||||
img_array = np.array(rotated_img)
|
||||
|
||||
# Perform OCR detection and recognition
|
||||
try:
|
||||
ocr_results = self.ocr(img_array)
|
||||
|
||||
if ocr_results:
|
||||
# Calculate average confidence
|
||||
scores = [conf for _, (_, conf) in ocr_results]
|
||||
avg_score = sum(scores) / len(scores) if scores else 0
|
||||
total_regions = len(scores)
|
||||
|
||||
# Combined score: considers both average confidence and number of regions
|
||||
# More regions + higher confidence = better orientation
|
||||
combined_score = avg_score * (1 + 0.1 * min(total_regions, 50) / 50)
|
||||
else:
|
||||
avg_score = 0
|
||||
total_regions = 0
|
||||
combined_score = 0
|
||||
|
||||
except Exception as e:
|
||||
logging.warning(f"OCR failed for angle {angle}: {e}")
|
||||
avg_score = 0
|
||||
total_regions = 0
|
||||
combined_score = 0
|
||||
|
||||
results[angle] = {"avg_confidence": avg_score, "total_regions": total_regions, "combined_score": combined_score}
|
||||
|
||||
logging.debug(f"Table orientation {angle}°: avg_conf={avg_score:.4f}, regions={total_regions}, combined={combined_score:.4f}")
|
||||
|
||||
if combined_score > best_score:
|
||||
best_score = combined_score
|
||||
best_angle = angle
|
||||
best_img = rotated_img
|
||||
|
||||
logging.info(f"Best table orientation: {best_angle}° (score={best_score:.4f})")
|
||||
|
||||
return best_angle, best_img, results
|
||||
|
||||
def _table_transformer_job(self, ZM, auto_rotate=True):
|
||||
"""
|
||||
Process table structure recognition.
|
||||
|
||||
When auto_rotate=True, the complete workflow:
|
||||
1. Evaluate table orientation and select the best rotation angle
|
||||
2. Use rotated image for table structure recognition (TSR)
|
||||
3. Re-OCR the rotated image
|
||||
4. Match new OCR results with TSR cell coordinates
|
||||
|
||||
Args:
|
||||
ZM: Zoom factor
|
||||
auto_rotate: Whether to enable auto orientation correction
|
||||
"""
|
||||
logging.debug("Table processing...")
|
||||
imgs, pos = [], []
|
||||
tbcnt = [0]
|
||||
MARGIN = 10
|
||||
self.tb_cpns = []
|
||||
self.table_rotations = {} # Store rotation info for each table
|
||||
self.rotated_table_imgs = {} # Store rotated table images
|
||||
|
||||
assert len(self.page_layout) == len(self.page_images)
|
||||
|
||||
# Collect layout info for all tables
|
||||
table_layouts = [] # [(page, table_layout, left, top, right, bott), ...]
|
||||
|
||||
table_index = 0
|
||||
for p, tbls in enumerate(self.page_layout): # for page
|
||||
tbls = [f for f in tbls if f["type"] == "table"]
|
||||
tbcnt.append(len(tbls))
|
||||
@ -214,29 +314,70 @@ class RAGFlowPdfParser:
|
||||
top *= ZM
|
||||
right *= ZM
|
||||
bott *= ZM
|
||||
pos.append((left, top))
|
||||
imgs.append(self.page_images[p].crop((left, top, right, bott)))
|
||||
pos.append((left, top, p, table_index)) # Add page and table_index
|
||||
|
||||
# Record table layout info
|
||||
table_layouts.append({"page": p, "table_index": table_index, "layout": tb, "coords": (left, top, right, bott)})
|
||||
|
||||
# Crop table image
|
||||
table_img = self.page_images[p].crop((left, top, right, bott))
|
||||
|
||||
if auto_rotate:
|
||||
# Evaluate table orientation
|
||||
logging.debug(f"Evaluating orientation for table {table_index} on page {p}")
|
||||
best_angle, rotated_img, rotation_scores = self._evaluate_table_orientation(table_img)
|
||||
|
||||
# Store rotation info
|
||||
self.table_rotations[table_index] = {
|
||||
"page": p,
|
||||
"original_pos": (left, top, right, bott),
|
||||
"best_angle": best_angle,
|
||||
"scores": rotation_scores,
|
||||
"rotated_size": rotated_img.size, # (width, height)
|
||||
}
|
||||
|
||||
# Store the rotated image
|
||||
self.rotated_table_imgs[table_index] = rotated_img
|
||||
imgs.append(rotated_img)
|
||||
|
||||
if best_angle != 0:
|
||||
logging.info(f"Table {table_index} on page {p}: rotated {best_angle}° for better recognition")
|
||||
else:
|
||||
imgs.append(table_img)
|
||||
self.table_rotations[table_index] = {"page": p, "original_pos": (left, top, right, bott), "best_angle": 0, "scores": {}, "rotated_size": table_img.size}
|
||||
self.rotated_table_imgs[table_index] = table_img
|
||||
|
||||
table_index += 1
|
||||
|
||||
assert len(self.page_images) == len(tbcnt) - 1
|
||||
if not imgs:
|
||||
return
|
||||
|
||||
# Perform table structure recognition (TSR)
|
||||
recos = self.tbl_det(imgs)
|
||||
|
||||
# If tables were rotated, re-OCR the rotated images and replace table boxes
|
||||
if auto_rotate:
|
||||
self._ocr_rotated_tables(ZM, table_layouts, recos, tbcnt)
|
||||
|
||||
# Process TSR results (keep original logic but handle rotated coordinates)
|
||||
tbcnt = np.cumsum(tbcnt)
|
||||
for i in range(len(tbcnt) - 1): # for page
|
||||
pg = []
|
||||
for j, tb_items in enumerate(recos[tbcnt[i] : tbcnt[i + 1]]): # for table
|
||||
poss = pos[tbcnt[i] : tbcnt[i + 1]]
|
||||
for it in tb_items: # for table components
|
||||
it["x0"] = it["x0"] + poss[j][0]
|
||||
it["x1"] = it["x1"] + poss[j][0]
|
||||
it["top"] = it["top"] + poss[j][1]
|
||||
it["bottom"] = it["bottom"] + poss[j][1]
|
||||
for n in ["x0", "x1", "top", "bottom"]:
|
||||
it[n] /= ZM
|
||||
it["top"] += self.page_cum_height[i]
|
||||
it["bottom"] += self.page_cum_height[i]
|
||||
it["pn"] = i
|
||||
# TSR coordinates are relative to rotated image, need to record
|
||||
it["x0_rotated"] = it["x0"]
|
||||
it["x1_rotated"] = it["x1"]
|
||||
it["top_rotated"] = it["top"]
|
||||
it["bottom_rotated"] = it["bottom"]
|
||||
|
||||
# For rotated tables, coordinate transformation to page space requires rotation
|
||||
# Since we already re-OCR'd on rotated image, keep simple processing here
|
||||
it["pn"] = poss[j][2] # page number
|
||||
it["layoutno"] = j
|
||||
it["table_index"] = poss[j][3] # table index
|
||||
pg.append(it)
|
||||
self.tb_cpns.extend(pg)
|
||||
|
||||
@ -249,8 +390,9 @@ class RAGFlowPdfParser:
|
||||
headers = gather(r".*header$")
|
||||
rows = gather(r".* (row|header)")
|
||||
spans = gather(r".*spanning")
|
||||
clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
|
||||
clmns = sorted([r for r in self.tb_cpns if re.match(r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0_rotated"] if "x0_rotated" in x else x["x0"]))
|
||||
clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
|
||||
|
||||
for b in self.boxes:
|
||||
if b.get("layout_type", "") != "table":
|
||||
continue
|
||||
@ -282,6 +424,109 @@ class RAGFlowPdfParser:
|
||||
b["H_right"] = spans[ii]["x1"]
|
||||
b["SP"] = ii
|
||||
|
||||
def _ocr_rotated_tables(self, ZM, table_layouts, tsr_results, tbcnt):
|
||||
"""
|
||||
Re-OCR rotated table images and update self.boxes.
|
||||
|
||||
Args:
|
||||
ZM: Zoom factor
|
||||
table_layouts: List of table layout info
|
||||
tsr_results: TSR recognition results
|
||||
tbcnt: Cumulative table count per page
|
||||
"""
|
||||
tbcnt = np.cumsum(tbcnt)
|
||||
|
||||
for tbl_info in table_layouts:
|
||||
table_index = tbl_info["table_index"]
|
||||
page = tbl_info["page"]
|
||||
layout = tbl_info["layout"]
|
||||
left, top, right, bott = tbl_info["coords"]
|
||||
|
||||
rotation_info = self.table_rotations.get(table_index, {})
|
||||
best_angle = rotation_info.get("best_angle", 0)
|
||||
|
||||
# Get the rotated table image
|
||||
rotated_img = self.rotated_table_imgs.get(table_index)
|
||||
if rotated_img is None:
|
||||
continue
|
||||
|
||||
# If table was rotated, re-OCR the rotated image
|
||||
if best_angle != 0:
|
||||
logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°")
|
||||
|
||||
# Perform OCR on rotated image
|
||||
img_array = np.array(rotated_img)
|
||||
ocr_results = self.ocr(img_array)
|
||||
|
||||
if not ocr_results:
|
||||
logging.warning(f"No OCR results for rotated table {table_index}")
|
||||
continue
|
||||
|
||||
# Remove original text boxes from this table region in self.boxes
|
||||
# Table region is defined by layout's x0, top, x1, bottom
|
||||
table_x0 = layout["x0"]
|
||||
table_top = layout["top"]
|
||||
table_x1 = layout["x1"]
|
||||
table_bottom = layout["bottom"]
|
||||
|
||||
# Filter out original boxes within the table region
|
||||
original_box_count = len(self.boxes)
|
||||
self.boxes = [
|
||||
b
|
||||
for b in self.boxes
|
||||
if not (
|
||||
b.get("page_number") == page + self.page_from
|
||||
and b.get("layout_type") == "table"
|
||||
and b["x0"] >= table_x0 - 5
|
||||
and b["x1"] <= table_x1 + 5
|
||||
and b["top"] >= table_top - 5
|
||||
and b["bottom"] <= table_bottom + 5
|
||||
)
|
||||
]
|
||||
removed_count = original_box_count - len(self.boxes)
|
||||
logging.debug(f"Removed {removed_count} original boxes from table {table_index}")
|
||||
|
||||
# Add new OCR results to self.boxes
|
||||
# OCR coordinates are relative to rotated image, need to preserve
|
||||
rotated_width, rotated_height = rotated_img.size
|
||||
|
||||
for bbox, (text, conf) in ocr_results:
|
||||
if conf < 0.5: # Filter low confidence results
|
||||
continue
|
||||
|
||||
# bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
x_coords = [p[0] for p in bbox]
|
||||
y_coords = [p[1] for p in bbox]
|
||||
|
||||
# Coordinates in rotated image
|
||||
box_x0 = min(x_coords) / ZM
|
||||
box_x1 = max(x_coords) / ZM
|
||||
box_top = min(y_coords) / ZM
|
||||
box_bottom = max(y_coords) / ZM
|
||||
|
||||
# Create new box, mark as from rotated table
|
||||
new_box = {
|
||||
"text": text,
|
||||
"x0": box_x0 + table_x0, # Coordinates relative to page
|
||||
"x1": box_x1 + table_x0,
|
||||
"top": box_top + table_top + self.page_cum_height[page],
|
||||
"bottom": box_bottom + table_top + self.page_cum_height[page],
|
||||
"page_number": page + self.page_from,
|
||||
"layout_type": "table",
|
||||
"layoutno": f"table-{table_index}",
|
||||
"_rotated": True,
|
||||
"_rotation_angle": best_angle,
|
||||
"_table_index": table_index,
|
||||
# Save original coordinates in rotated image for table reconstruction
|
||||
"_rotated_x0": box_x0,
|
||||
"_rotated_x1": box_x1,
|
||||
"_rotated_top": box_top,
|
||||
"_rotated_bottom": box_bottom,
|
||||
}
|
||||
self.boxes.append(new_box)
|
||||
|
||||
logging.info(f"Added {len(ocr_results)} OCR results from rotated table {table_index}")
|
||||
|
||||
def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
|
||||
start = timer()
|
||||
bxs = self.ocr.detect(np.array(img), device_id)
|
||||
@ -412,11 +657,9 @@ class RAGFlowPdfParser:
|
||||
page_cols[pg] = best_k
|
||||
logging.info(f"[Page {pg}] best_score={best_score:.2f}, best_k={best_k}")
|
||||
|
||||
|
||||
global_cols = Counter(page_cols.values()).most_common(1)[0][0]
|
||||
logging.info(f"Global column_num decided by majority: {global_cols}")
|
||||
|
||||
|
||||
for pg, bxs in by_page.items():
|
||||
if not bxs:
|
||||
continue
|
||||
@ -1184,10 +1427,26 @@ class RAGFlowPdfParser:
|
||||
if len(self.boxes) == 0 and zoomin < 9:
|
||||
self.__images__(fnm, zoomin * 3, page_from, page_to, callback)
|
||||
|
||||
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
|
||||
def __call__(self, fnm, need_image=True, zoomin=3, return_html=False, auto_rotate_tables=None):
|
||||
"""
|
||||
Parse a PDF file.
|
||||
|
||||
Args:
|
||||
fnm: PDF file path or binary content
|
||||
need_image: Whether to extract images
|
||||
zoomin: Zoom factor
|
||||
return_html: Whether to return tables in HTML format
|
||||
auto_rotate_tables: Whether to enable auto orientation correction for tables.
|
||||
None: Use TABLE_AUTO_ROTATE env var setting (default: True)
|
||||
True: Enable auto orientation correction
|
||||
False: Disable auto orientation correction
|
||||
"""
|
||||
if auto_rotate_tables is None:
|
||||
auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes")
|
||||
|
||||
self.__images__(fnm, zoomin)
|
||||
self._layouts_rec(zoomin)
|
||||
self._table_transformer_job(zoomin)
|
||||
self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables)
|
||||
self._text_merge()
|
||||
self._concat_downward()
|
||||
self._filter_forpages()
|
||||
@ -1205,8 +1464,11 @@ class RAGFlowPdfParser:
|
||||
if callback:
|
||||
callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))
|
||||
|
||||
# Read table auto-rotation setting from environment variable
|
||||
auto_rotate_tables = os.getenv("TABLE_AUTO_ROTATE", "true").lower() in ("true", "1", "yes")
|
||||
|
||||
start = timer()
|
||||
self._table_transformer_job(zoomin)
|
||||
self._table_transformer_job(zoomin, auto_rotate=auto_rotate_tables)
|
||||
if callback:
|
||||
callback(0.83, "Table analysis ({:.2f}s)".format(timer() - start))
|
||||
|
||||
@ -1498,10 +1760,7 @@ class VisionParser(RAGFlowPdfParser):
|
||||
|
||||
if text:
|
||||
width, height = self.page_images[idx].size
|
||||
all_docs.append((
|
||||
text,
|
||||
f"@@{pdf_page_num + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##"
|
||||
))
|
||||
all_docs.append((text, f"@@{pdf_page_num + 1}\t{0.0:.1f}\t{width / zoomin:.1f}\t{0.0:.1f}\t{height / zoomin:.1f}##"))
|
||||
return all_docs, []
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user