mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-04 17:45:07 +08:00
Fix: pdf chunking / table rotation (#12981)
### What problem does this PR solve? Fix: PDF chunking issue for single-page documents Refactor: Change the default refresh frequency to 5 Fix: Add a 0-degree threshold; require other rotation angles to exceed it by at least 0.2 Fix: Put connector name tips to correct place Fix: incorrect example response in delete datasets. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring
This commit is contained in:
@ -52,7 +52,7 @@ async def set_connector():
|
||||
"source": req["source"],
|
||||
"input_type": InputType.POLL,
|
||||
"config": req["config"],
|
||||
"refresh_freq": int(req.get("refresh_freq", 30)),
|
||||
"refresh_freq": int(req.get("refresh_freq", 5)),
|
||||
"prune_freq": int(req.get("prune_freq", 720)),
|
||||
"timeout_secs": int(req.get("timeout_secs", 60 * 29)),
|
||||
"status": TaskStatus.SCHEDULE,
|
||||
|
||||
@ -226,6 +226,7 @@ class RAGFlowPdfParser:
|
||||
best_score = -1
|
||||
best_angle = 0
|
||||
best_img = table_img
|
||||
score_0 = None
|
||||
|
||||
for angle, name in rotations:
|
||||
# Rotate image
|
||||
@ -263,6 +264,8 @@ class RAGFlowPdfParser:
|
||||
combined_score = 0
|
||||
|
||||
results[angle] = {"avg_confidence": avg_score, "total_regions": total_regions, "combined_score": combined_score}
|
||||
if angle == 0:
|
||||
score_0 = combined_score
|
||||
|
||||
logging.debug(f"Table orientation {angle}°: avg_conf={avg_score:.4f}, regions={total_regions}, combined={combined_score:.4f}")
|
||||
|
||||
@ -271,6 +274,16 @@ class RAGFlowPdfParser:
|
||||
best_angle = angle
|
||||
best_img = rotated_img
|
||||
|
||||
# Absolute threshold rule:
|
||||
# Only choose non-0° if it exceeds 0° by more than 0.2 and 0° score is below 0.8.
|
||||
if best_angle != 0 and score_0 is not None:
|
||||
if not (best_score - score_0 > 0.2 and score_0 < 0.8):
|
||||
best_angle = 0
|
||||
best_img = table_img
|
||||
best_score = score_0
|
||||
|
||||
results[best_angle] = results.get(best_angle, {"avg_confidence": 0, "total_regions": 0, "combined_score": 0})
|
||||
|
||||
logging.info(f"Best table orientation: {best_angle}° (score={best_score:.4f})")
|
||||
|
||||
return best_angle, best_img, results
|
||||
@ -340,8 +353,6 @@ class RAGFlowPdfParser:
|
||||
self.rotated_table_imgs[table_index] = rotated_img
|
||||
imgs.append(rotated_img)
|
||||
|
||||
if best_angle != 0:
|
||||
logging.info(f"Table {table_index} on page {p}: rotated {best_angle}° for better recognition")
|
||||
else:
|
||||
imgs.append(table_img)
|
||||
self.table_rotations[table_index] = {"page": p, "original_pos": (left, top, right, bott), "best_angle": 0, "scores": {}, "rotated_size": table_img.size}
|
||||
@ -436,6 +447,90 @@ class RAGFlowPdfParser:
|
||||
"""
|
||||
tbcnt = np.cumsum(tbcnt)
|
||||
|
||||
def _table_region(layout, page_index):
|
||||
table_x0 = layout["x0"]
|
||||
table_top = layout["top"]
|
||||
table_x1 = layout["x1"]
|
||||
table_bottom = layout["bottom"]
|
||||
table_top_cum = table_top + self.page_cum_height[page_index]
|
||||
table_bottom_cum = table_bottom + self.page_cum_height[page_index]
|
||||
return table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum
|
||||
|
||||
def _collect_table_boxes(page_index, table_x0, table_x1, table_top_cum, table_bottom_cum):
|
||||
indices = [
|
||||
i
|
||||
for i, b in enumerate(self.boxes)
|
||||
if (
|
||||
b.get("page_number") == page_index + self.page_from
|
||||
and b.get("layout_type") == "table"
|
||||
and b["x0"] >= table_x0 - 5
|
||||
and b["x1"] <= table_x1 + 5
|
||||
and b["top"] >= table_top_cum - 5
|
||||
and b["bottom"] <= table_bottom_cum + 5
|
||||
)
|
||||
]
|
||||
original_boxes = [self.boxes[i] for i in indices]
|
||||
insert_at = indices[0] if indices else len(self.boxes)
|
||||
for i in reversed(indices):
|
||||
self.boxes.pop(i)
|
||||
return original_boxes, insert_at
|
||||
|
||||
def _restore_boxes(original_boxes, insert_at):
|
||||
for b in original_boxes:
|
||||
self.boxes.insert(insert_at, b)
|
||||
insert_at += 1
|
||||
return insert_at
|
||||
|
||||
def _map_rotated_point(x, y, angle, width, height):
|
||||
# Map a point from rotated image coords back to original image coords.
|
||||
if angle == 0:
|
||||
return x, y
|
||||
if angle == 90:
|
||||
# clockwise 90: original->rotated (x', y') = (y, width - x)
|
||||
# inverse:
|
||||
return width - y, x
|
||||
if angle == 180:
|
||||
return width - x, height - y
|
||||
if angle == 270:
|
||||
# clockwise 270: original->rotated (x', y') = (height - y, x)
|
||||
# inverse:
|
||||
return y, height - x
|
||||
return x, y
|
||||
|
||||
def _insert_ocr_boxes(ocr_results, page_index, table_x0, table_top, insert_at, table_index, best_angle, table_w_px, table_h_px):
|
||||
added = 0
|
||||
for bbox, (text, conf) in ocr_results:
|
||||
if conf < 0.5:
|
||||
continue
|
||||
mapped = [_map_rotated_point(p[0], p[1], best_angle, table_w_px, table_h_px) for p in bbox]
|
||||
x_coords = [p[0] for p in mapped]
|
||||
y_coords = [p[1] for p in mapped]
|
||||
box_x0 = min(x_coords) / ZM
|
||||
box_x1 = max(x_coords) / ZM
|
||||
box_top = min(y_coords) / ZM
|
||||
box_bottom = max(y_coords) / ZM
|
||||
new_box = {
|
||||
"text": text,
|
||||
"x0": box_x0 + table_x0,
|
||||
"x1": box_x1 + table_x0,
|
||||
"top": box_top + table_top + self.page_cum_height[page_index],
|
||||
"bottom": box_bottom + table_top + self.page_cum_height[page_index],
|
||||
"page_number": page_index + self.page_from,
|
||||
"layout_type": "table",
|
||||
"layoutno": f"table-{table_index}",
|
||||
"_rotated": True,
|
||||
"_rotation_angle": best_angle,
|
||||
"_table_index": table_index,
|
||||
"_rotated_x0": box_x0,
|
||||
"_rotated_x1": box_x1,
|
||||
"_rotated_top": box_top,
|
||||
"_rotated_bottom": box_bottom,
|
||||
}
|
||||
self.boxes.insert(insert_at, new_box)
|
||||
insert_at += 1
|
||||
added += 1
|
||||
return added
|
||||
|
||||
for tbl_info in table_layouts:
|
||||
table_index = tbl_info["table_index"]
|
||||
page = tbl_info["page"]
|
||||
@ -450,82 +545,42 @@ class RAGFlowPdfParser:
|
||||
if rotated_img is None:
|
||||
continue
|
||||
|
||||
# If table was rotated, re-OCR the rotated image
|
||||
if best_angle != 0:
|
||||
logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°")
|
||||
# If no rotation, keep original OCR boxes untouched.
|
||||
if best_angle == 0:
|
||||
continue
|
||||
|
||||
# Perform OCR on rotated image
|
||||
img_array = np.array(rotated_img)
|
||||
ocr_results = self.ocr(img_array)
|
||||
# Table region is defined by layout's x0, top, x1, bottom (page-local coords)
|
||||
table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum = _table_region(layout, page)
|
||||
original_boxes, insert_at = _collect_table_boxes(page, table_x0, table_x1, table_top_cum, table_bottom_cum)
|
||||
|
||||
if not ocr_results:
|
||||
logging.warning(f"No OCR results for rotated table {table_index}")
|
||||
continue
|
||||
logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°")
|
||||
|
||||
# Remove original text boxes from this table region in self.boxes
|
||||
# Table region is defined by layout's x0, top, x1, bottom
|
||||
table_x0 = layout["x0"]
|
||||
table_top = layout["top"]
|
||||
table_x1 = layout["x1"]
|
||||
table_bottom = layout["bottom"]
|
||||
# Perform OCR on rotated image
|
||||
img_array = np.array(rotated_img)
|
||||
ocr_results = self.ocr(img_array)
|
||||
|
||||
# Filter out original boxes within the table region
|
||||
original_box_count = len(self.boxes)
|
||||
self.boxes = [
|
||||
b
|
||||
for b in self.boxes
|
||||
if not (
|
||||
b.get("page_number") == page + self.page_from
|
||||
and b.get("layout_type") == "table"
|
||||
and b["x0"] >= table_x0 - 5
|
||||
and b["x1"] <= table_x1 + 5
|
||||
and b["top"] >= table_top - 5
|
||||
and b["bottom"] <= table_bottom + 5
|
||||
)
|
||||
]
|
||||
removed_count = original_box_count - len(self.boxes)
|
||||
logging.debug(f"Removed {removed_count} original boxes from table {table_index}")
|
||||
if not ocr_results:
|
||||
logging.warning(f"No OCR results for rotated table {table_index}, restoring originals")
|
||||
_restore_boxes(original_boxes, insert_at)
|
||||
continue
|
||||
|
||||
# Add new OCR results to self.boxes
|
||||
# OCR coordinates are relative to rotated image, need to preserve
|
||||
rotated_width, rotated_height = rotated_img.size
|
||||
# Add new OCR results to self.boxes
|
||||
# OCR coordinates are relative to rotated image, map back to original table coords
|
||||
table_w_px = right - left
|
||||
table_h_px = bott - top
|
||||
added = _insert_ocr_boxes(
|
||||
ocr_results,
|
||||
page,
|
||||
table_x0,
|
||||
table_top,
|
||||
insert_at,
|
||||
table_index,
|
||||
best_angle,
|
||||
table_w_px,
|
||||
table_h_px,
|
||||
)
|
||||
|
||||
for bbox, (text, conf) in ocr_results:
|
||||
if conf < 0.5: # Filter low confidence results
|
||||
continue
|
||||
|
||||
# bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
|
||||
x_coords = [p[0] for p in bbox]
|
||||
y_coords = [p[1] for p in bbox]
|
||||
|
||||
# Coordinates in rotated image
|
||||
box_x0 = min(x_coords) / ZM
|
||||
box_x1 = max(x_coords) / ZM
|
||||
box_top = min(y_coords) / ZM
|
||||
box_bottom = max(y_coords) / ZM
|
||||
|
||||
# Create new box, mark as from rotated table
|
||||
new_box = {
|
||||
"text": text,
|
||||
"x0": box_x0 + table_x0, # Coordinates relative to page
|
||||
"x1": box_x1 + table_x0,
|
||||
"top": box_top + table_top + self.page_cum_height[page],
|
||||
"bottom": box_bottom + table_top + self.page_cum_height[page],
|
||||
"page_number": page + self.page_from,
|
||||
"layout_type": "table",
|
||||
"layoutno": f"table-{table_index}",
|
||||
"_rotated": True,
|
||||
"_rotation_angle": best_angle,
|
||||
"_table_index": table_index,
|
||||
# Save original coordinates in rotated image for table reconstruction
|
||||
"_rotated_x0": box_x0,
|
||||
"_rotated_x1": box_x1,
|
||||
"_rotated_top": box_top,
|
||||
"_rotated_bottom": box_bottom,
|
||||
}
|
||||
self.boxes.append(new_box)
|
||||
|
||||
logging.info(f"Added {len(ocr_results)} OCR results from rotated table {table_index}")
|
||||
logging.info(f"Added {added} OCR results from rotated table {table_index}")
|
||||
|
||||
def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
|
||||
start = timer()
|
||||
@ -1106,7 +1161,7 @@ class RAGFlowPdfParser:
|
||||
max_page_index = len(self.page_images) - 1
|
||||
|
||||
def local_page_index(page_number):
|
||||
idx = page_number - 1
|
||||
idx = page_number - 1 if page_number > 0 else 0
|
||||
if idx > max_page_index and self.page_from:
|
||||
idx = page_number - 1 - self.page_from
|
||||
return idx
|
||||
|
||||
@ -677,9 +677,10 @@ Failure:
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 102,
|
||||
"message": "You don't own the dataset."
|
||||
"code":108,
|
||||
"message":"User '<tenant_id>' lacks permission for datasets: '<dataset_ids>'"
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
@ -7,7 +7,6 @@ export const confluenceConstant = (t: TFunction) => [
|
||||
name: 'config.credentials.confluence_username',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
tooltip: t('setting.connectorNameTip'),
|
||||
},
|
||||
{
|
||||
label: 'Confluence Access Token',
|
||||
|
||||
@ -200,6 +200,7 @@ export const DataSourceFormBaseFields = [
|
||||
name: 'name',
|
||||
type: FormFieldType.Text,
|
||||
required: true,
|
||||
tooltip: t('setting.connectorNameTip'),
|
||||
},
|
||||
{
|
||||
label: 'Source',
|
||||
|
||||
Reference in New Issue
Block a user