Fix: pdf chunking / table rotation (#12981)

### What problem does this PR solve?

- Fix: PDF chunking issue for single-page documents.
- Refactor: Change the default connector refresh frequency from 30 to 5.
- Fix: Add a 0-degree threshold for table rotation; a non-zero rotation angle is chosen only when its score exceeds the 0° score by more than 0.2 and the 0° score is below 0.8.
- Fix: Move the connector name tooltip to the correct field.
- Fix: Incorrect example response in the delete datasets API reference.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [x] Refactoring
2026-02-04 17:45:07 +08:00 · 2026-02-04 17:00:25 +08:00
parent 0470fc59b1
commit a37d287fad
5 changed files with 133 additions and 77 deletions
--- a/api/apps/connector_app.py
+++ b/api/apps/connector_app.py
@ -52,7 +52,7 @@ async def set_connector():
            "source": req["source"],
            "input_type": InputType.POLL,
            "config": req["config"],
-            "refresh_freq": int(req.get("refresh_freq", 30)),
+            "refresh_freq": int(req.get("refresh_freq", 5)),
            "prune_freq": int(req.get("prune_freq", 720)),
            "timeout_secs": int(req.get("timeout_secs", 60 * 29)),
            "status": TaskStatus.SCHEDULE,
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@ -226,6 +226,7 @@ class RAGFlowPdfParser:
        best_score = -1
        best_angle = 0
        best_img = table_img
+        score_0 = None

        for angle, name in rotations:
            # Rotate image
@ -263,6 +264,8 @@ class RAGFlowPdfParser:
                combined_score = 0

            results[angle] = {"avg_confidence": avg_score, "total_regions": total_regions, "combined_score": combined_score}
+            if angle == 0:
+                score_0 = combined_score

            logging.debug(f"Table orientation {angle}°: avg_conf={avg_score:.4f}, regions={total_regions}, combined={combined_score:.4f}")

@ -271,6 +274,16 @@ class RAGFlowPdfParser:
                best_angle = angle
                best_img = rotated_img

+        # Absolute threshold rule:
+        # Only choose non-0° if it exceeds 0° by more than 0.2 and 0° score is below 0.8.
+        if best_angle != 0 and score_0 is not None:
+            if not (best_score - score_0 > 0.2 and score_0 < 0.8):
+                best_angle = 0
+                best_img = table_img
+                best_score = score_0
+
+        results[best_angle] = results.get(best_angle, {"avg_confidence": 0, "total_regions": 0, "combined_score": 0})
+
        logging.info(f"Best table orientation: {best_angle}° (score={best_score:.4f})")

        return best_angle, best_img, results
@ -340,8 +353,6 @@ class RAGFlowPdfParser:
                    self.rotated_table_imgs[table_index] = rotated_img
                    imgs.append(rotated_img)

-                    if best_angle != 0:
-                        logging.info(f"Table {table_index} on page {p}: rotated {best_angle}° for better recognition")
                else:
                    imgs.append(table_img)
                    self.table_rotations[table_index] = {"page": p, "original_pos": (left, top, right, bott), "best_angle": 0, "scores": {}, "rotated_size": table_img.size}
@ -436,6 +447,90 @@ class RAGFlowPdfParser:
        """
        tbcnt = np.cumsum(tbcnt)

+        def _table_region(layout, page_index):
+            table_x0 = layout["x0"]
+            table_top = layout["top"]
+            table_x1 = layout["x1"]
+            table_bottom = layout["bottom"]
+            table_top_cum = table_top + self.page_cum_height[page_index]
+            table_bottom_cum = table_bottom + self.page_cum_height[page_index]
+            return table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum
+
+        def _collect_table_boxes(page_index, table_x0, table_x1, table_top_cum, table_bottom_cum):
+            indices = [
+                i
+                for i, b in enumerate(self.boxes)
+                if (
+                    b.get("page_number") == page_index + self.page_from
+                    and b.get("layout_type") == "table"
+                    and b["x0"] >= table_x0 - 5
+                    and b["x1"] <= table_x1 + 5
+                    and b["top"] >= table_top_cum - 5
+                    and b["bottom"] <= table_bottom_cum + 5
+                )
+            ]
+            original_boxes = [self.boxes[i] for i in indices]
+            insert_at = indices[0] if indices else len(self.boxes)
+            for i in reversed(indices):
+                self.boxes.pop(i)
+            return original_boxes, insert_at
+
+        def _restore_boxes(original_boxes, insert_at):
+            for b in original_boxes:
+                self.boxes.insert(insert_at, b)
+                insert_at += 1
+            return insert_at
+
+        def _map_rotated_point(x, y, angle, width, height):
+            # Map a point from rotated image coords back to original image coords.
+            if angle == 0:
+                return x, y
+            if angle == 90:
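+                # original->rotated (x', y') = (y, width - x); in y-down image coords this is a
+                # 90° counter-clockwise turn. NOTE(review): previously labelled "clockwise 90" —
+                # confirm the direction against how the rotated table image is produced.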
+                # inverse:
+                return width - y, x
+            if angle == 180:
+                return width - x, height - y
+            if angle == 270:
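+                # original->rotated (x', y') = (height - y, x); in y-down image coords this is a
+                # 90° clockwise (i.e. 270° counter-clockwise) turn. NOTE(review): previously
+                # labelled "clockwise 270" — confirm the direction against how the rotated table image is produced.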
+                # inverse:
+                return y, height - x
+            return x, y
+
+        def _insert_ocr_boxes(ocr_results, page_index, table_x0, table_top, insert_at, table_index, best_angle, table_w_px, table_h_px):
+            added = 0
+            for bbox, (text, conf) in ocr_results:
+                if conf < 0.5:
+                    continue
+                mapped = [_map_rotated_point(p[0], p[1], best_angle, table_w_px, table_h_px) for p in bbox]
+                x_coords = [p[0] for p in mapped]
+                y_coords = [p[1] for p in mapped]
+                box_x0 = min(x_coords) / ZM
+                box_x1 = max(x_coords) / ZM
+                box_top = min(y_coords) / ZM
+                box_bottom = max(y_coords) / ZM
+                new_box = {
+                    "text": text,
+                    "x0": box_x0 + table_x0,
+                    "x1": box_x1 + table_x0,
+                    "top": box_top + table_top + self.page_cum_height[page_index],
+                    "bottom": box_bottom + table_top + self.page_cum_height[page_index],
+                    "page_number": page_index + self.page_from,
+                    "layout_type": "table",
+                    "layoutno": f"table-{table_index}",
+                    "_rotated": True,
+                    "_rotation_angle": best_angle,
+                    "_table_index": table_index,
+                    "_rotated_x0": box_x0,
+                    "_rotated_x1": box_x1,
+                    "_rotated_top": box_top,
+                    "_rotated_bottom": box_bottom,
+                }
+                self.boxes.insert(insert_at, new_box)
+                insert_at += 1
+                added += 1
+            return added
+
        for tbl_info in table_layouts:
            table_index = tbl_info["table_index"]
            page = tbl_info["page"]
@ -450,82 +545,42 @@ class RAGFlowPdfParser:
            if rotated_img is None:
                continue

-            # If table was rotated, re-OCR the rotated image
-            if best_angle != 0:
-                logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°")
+            # If no rotation, keep original OCR boxes untouched.
+            if best_angle == 0:
+                continue

-                # Perform OCR on rotated image
-                img_array = np.array(rotated_img)
-                ocr_results = self.ocr(img_array)
+            # Table region is defined by layout's x0, top, x1, bottom (page-local coords)
+            table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum = _table_region(layout, page)
+            original_boxes, insert_at = _collect_table_boxes(page, table_x0, table_x1, table_top_cum, table_bottom_cum)

-                if not ocr_results:
-                    logging.warning(f"No OCR results for rotated table {table_index}")
-                    continue
+            logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°")

-                # Remove original text boxes from this table region in self.boxes
-                # Table region is defined by layout's x0, top, x1, bottom
-                table_x0 = layout["x0"]
-                table_top = layout["top"]
-                table_x1 = layout["x1"]
-                table_bottom = layout["bottom"]
+            # Perform OCR on rotated image
+            img_array = np.array(rotated_img)
+            ocr_results = self.ocr(img_array)

-                # Filter out original boxes within the table region
-                original_box_count = len(self.boxes)
-                self.boxes = [
-                    b
-                    for b in self.boxes
-                    if not (
-                        b.get("page_number") == page + self.page_from
-                        and b.get("layout_type") == "table"
-                        and b["x0"] >= table_x0 - 5
-                        and b["x1"] <= table_x1 + 5
-                        and b["top"] >= table_top - 5
-                        and b["bottom"] <= table_bottom + 5
-                    )
-                ]
-                removed_count = original_box_count - len(self.boxes)
-                logging.debug(f"Removed {removed_count} original boxes from table {table_index}")
+            if not ocr_results:
+                logging.warning(f"No OCR results for rotated table {table_index}, restoring originals")
+                _restore_boxes(original_boxes, insert_at)
+                continue

-                # Add new OCR results to self.boxes
-                # OCR coordinates are relative to rotated image, need to preserve
-                rotated_width, rotated_height = rotated_img.size
+            # Add new OCR results to self.boxes
+            # OCR coordinates are relative to rotated image, map back to original table coords
+            table_w_px = right - left
+            table_h_px = bott - top
+            added = _insert_ocr_boxes(
+                ocr_results,
+                page,
+                table_x0,
+                table_top,
+                insert_at,
+                table_index,
+                best_angle,
+                table_w_px,
+                table_h_px,
+            )

-                for bbox, (text, conf) in ocr_results:
-                    if conf < 0.5:  # Filter low confidence results
-                        continue
-
-                    # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]]
-                    x_coords = [p[0] for p in bbox]
-                    y_coords = [p[1] for p in bbox]
-
-                    # Coordinates in rotated image
-                    box_x0 = min(x_coords) / ZM
-                    box_x1 = max(x_coords) / ZM
-                    box_top = min(y_coords) / ZM
-                    box_bottom = max(y_coords) / ZM
-
-                    # Create new box, mark as from rotated table
-                    new_box = {
-                        "text": text,
-                        "x0": box_x0 + table_x0,  # Coordinates relative to page
-                        "x1": box_x1 + table_x0,
-                        "top": box_top + table_top + self.page_cum_height[page],
-                        "bottom": box_bottom + table_top + self.page_cum_height[page],
-                        "page_number": page + self.page_from,
-                        "layout_type": "table",
-                        "layoutno": f"table-{table_index}",
-                        "_rotated": True,
-                        "_rotation_angle": best_angle,
-                        "_table_index": table_index,
-                        # Save original coordinates in rotated image for table reconstruction
-                        "_rotated_x0": box_x0,
-                        "_rotated_x1": box_x1,
-                        "_rotated_top": box_top,
-                        "_rotated_bottom": box_bottom,
-                    }
-                    self.boxes.append(new_box)
-
-                logging.info(f"Added {len(ocr_results)} OCR results from rotated table {table_index}")
+            logging.info(f"Added {added} OCR results from rotated table {table_index}")

    def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None):
        start = timer()
@ -1106,7 +1161,7 @@ class RAGFlowPdfParser:
            max_page_index = len(self.page_images) - 1

            def local_page_index(page_number):
-                idx = page_number - 1
+                idx = page_number - 1 if page_number > 0 else 0
                if idx > max_page_index and self.page_from:
                    idx = page_number - 1 - self.page_from
                return idx
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@ -677,9 +677,10 @@ Failure:

 ```json
 {
-    "code": 102,
-    "message": "You don't own the dataset."
+    "code":108,
+    "message":"User '<tenant_id>' lacks permission for datasets: '<dataset_ids>'"
 }
+
 ```

 ---
--- a/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx
+++ b/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx
@ -7,7 +7,6 @@ export const confluenceConstant = (t: TFunction) => [
    name: 'config.credentials.confluence_username',
    type: FormFieldType.Text,
    required: true,
-    tooltip: t('setting.connectorNameTip'),
  },
  {
    label: 'Confluence Access Token',
--- a/web/src/pages/user-setting/data-source/constant/index.tsx
+++ b/web/src/pages/user-setting/data-source/constant/index.tsx
@ -200,6 +200,7 @@ export const DataSourceFormBaseFields = [
    name: 'name',
    type: FormFieldType.Text,
    required: true,
+    tooltip: t('setting.connectorNameTip'),
  },
  {
    label: 'Source',