diff --git a/api/apps/connector_app.py b/api/apps/connector_app.py index fb074419b..0e687ea69 100644 --- a/api/apps/connector_app.py +++ b/api/apps/connector_app.py @@ -52,7 +52,7 @@ async def set_connector(): "source": req["source"], "input_type": InputType.POLL, "config": req["config"], - "refresh_freq": int(req.get("refresh_freq", 30)), + "refresh_freq": int(req.get("refresh_freq", 5)), "prune_freq": int(req.get("prune_freq", 720)), "timeout_secs": int(req.get("timeout_secs", 60 * 29)), "status": TaskStatus.SCHEDULE, diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 352a63414..6681e4a89 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -226,6 +226,7 @@ class RAGFlowPdfParser: best_score = -1 best_angle = 0 best_img = table_img + score_0 = None for angle, name in rotations: # Rotate image @@ -263,6 +264,8 @@ class RAGFlowPdfParser: combined_score = 0 results[angle] = {"avg_confidence": avg_score, "total_regions": total_regions, "combined_score": combined_score} + if angle == 0: + score_0 = combined_score logging.debug(f"Table orientation {angle}°: avg_conf={avg_score:.4f}, regions={total_regions}, combined={combined_score:.4f}") @@ -271,6 +274,16 @@ class RAGFlowPdfParser: best_angle = angle best_img = rotated_img + # Absolute threshold rule: + # Only choose non-0° if it exceeds 0° by more than 0.2 and 0° score is below 0.8. + if best_angle != 0 and score_0 is not None: + if not (best_score - score_0 > 0.2 and score_0 < 0.8): + best_angle = 0 + best_img = table_img + best_score = score_0 + + results[best_angle] = results.get(best_angle, {"avg_confidence": 0, "total_regions": 0, "combined_score": 0}) + logging.info(f"Best table orientation: {best_angle}° (score={best_score:.4f})") return best_angle, best_img, results @@ -340,8 +353,6 @@ class RAGFlowPdfParser: self.rotated_table_imgs[table_index] = rotated_img imgs.append(rotated_img) - if best_angle != 0: - logging.info(f"Table {table_index} on page {p}: rotated {best_angle}° for better recognition") else: imgs.append(table_img) self.table_rotations[table_index] = {"page": p, "original_pos": (left, top, right, bott), "best_angle": 0, "scores": {}, "rotated_size": table_img.size} @@ -436,6 +447,90 @@ class RAGFlowPdfParser: """ tbcnt = np.cumsum(tbcnt) + def _table_region(layout, page_index): + table_x0 = layout["x0"] + table_top = layout["top"] + table_x1 = layout["x1"] + table_bottom = layout["bottom"] + table_top_cum = table_top + self.page_cum_height[page_index] + table_bottom_cum = table_bottom + self.page_cum_height[page_index] + return table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum + + def _collect_table_boxes(page_index, table_x0, table_x1, table_top_cum, table_bottom_cum): + indices = [ + i + for i, b in enumerate(self.boxes) + if ( + b.get("page_number") == page_index + self.page_from + and b.get("layout_type") == "table" + and b["x0"] >= table_x0 - 5 + and b["x1"] <= table_x1 + 5 + and b["top"] >= table_top_cum - 5 + and b["bottom"] <= table_bottom_cum + 5 + ) + ] + original_boxes = [self.boxes[i] for i in indices] + insert_at = indices[0] if indices else len(self.boxes) + for i in reversed(indices): + self.boxes.pop(i) + return original_boxes, insert_at + + def _restore_boxes(original_boxes, insert_at): + for b in original_boxes: + self.boxes.insert(insert_at, b) + insert_at += 1 + return insert_at + + def _map_rotated_point(x, y, angle, width, height): + # Map a point from rotated image coords back to original image coords. + if angle == 0: + return x, y + if angle == 90: + # clockwise 90: original->rotated (x', y') = (y, width - x) + # inverse: + return width - y, x + if angle == 180: + return width - x, height - y + if angle == 270: + # clockwise 270: original->rotated (x', y') = (height - y, x) + # inverse: + return y, height - x + return x, y + + def _insert_ocr_boxes(ocr_results, page_index, table_x0, table_top, insert_at, table_index, best_angle, table_w_px, table_h_px): + added = 0 + for bbox, (text, conf) in ocr_results: + if conf < 0.5: + continue + mapped = [_map_rotated_point(p[0], p[1], best_angle, table_w_px, table_h_px) for p in bbox] + x_coords = [p[0] for p in mapped] + y_coords = [p[1] for p in mapped] + box_x0 = min(x_coords) / ZM + box_x1 = max(x_coords) / ZM + box_top = min(y_coords) / ZM + box_bottom = max(y_coords) / ZM + new_box = { + "text": text, + "x0": box_x0 + table_x0, + "x1": box_x1 + table_x0, + "top": box_top + table_top + self.page_cum_height[page_index], + "bottom": box_bottom + table_top + self.page_cum_height[page_index], + "page_number": page_index + self.page_from, + "layout_type": "table", + "layoutno": f"table-{table_index}", + "_rotated": True, + "_rotation_angle": best_angle, + "_table_index": table_index, + "_rotated_x0": box_x0, + "_rotated_x1": box_x1, + "_rotated_top": box_top, + "_rotated_bottom": box_bottom, + } + self.boxes.insert(insert_at, new_box) + insert_at += 1 + added += 1 + return added + for tbl_info in table_layouts: table_index = tbl_info["table_index"] page = tbl_info["page"] @@ -450,82 +545,42 @@ class RAGFlowPdfParser: if rotated_img is None: continue - # If table was rotated, re-OCR the rotated image - if best_angle != 0: - logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°") + # If no rotation, keep original OCR boxes untouched. + if best_angle == 0: + continue - # Perform OCR on rotated image - img_array = np.array(rotated_img) - ocr_results = self.ocr(img_array) + # Table region is defined by layout's x0, top, x1, bottom (page-local coords) + table_x0, table_top, table_x1, table_bottom, table_top_cum, table_bottom_cum = _table_region(layout, page) + original_boxes, insert_at = _collect_table_boxes(page, table_x0, table_x1, table_top_cum, table_bottom_cum) - if not ocr_results: - logging.warning(f"No OCR results for rotated table {table_index}") - continue + logging.info(f"Re-OCR table {table_index} on page {page} with rotation {best_angle}°") - # Remove original text boxes from this table region in self.boxes - # Table region is defined by layout's x0, top, x1, bottom - table_x0 = layout["x0"] - table_top = layout["top"] - table_x1 = layout["x1"] - table_bottom = layout["bottom"] + # Perform OCR on rotated image + img_array = np.array(rotated_img) + ocr_results = self.ocr(img_array) - # Filter out original boxes within the table region - original_box_count = len(self.boxes) - self.boxes = [ - b - for b in self.boxes - if not ( - b.get("page_number") == page + self.page_from - and b.get("layout_type") == "table" - and b["x0"] >= table_x0 - 5 - and b["x1"] <= table_x1 + 5 - and b["top"] >= table_top - 5 - and b["bottom"] <= table_bottom + 5 - ) - ] - removed_count = original_box_count - len(self.boxes) - logging.debug(f"Removed {removed_count} original boxes from table {table_index}") + if not ocr_results: + logging.warning(f"No OCR results for rotated table {table_index}, restoring originals") + _restore_boxes(original_boxes, insert_at) + continue - # Add new OCR results to self.boxes - # OCR coordinates are relative to rotated image, need to preserve - rotated_width, rotated_height = rotated_img.size + # Add new OCR results to self.boxes + # OCR coordinates are relative to rotated image, map back to original table coords + table_w_px = right - left + table_h_px = bott - top + added = _insert_ocr_boxes( + ocr_results, + page, + table_x0, + table_top, + insert_at, + table_index, + best_angle, + table_w_px, + table_h_px, + ) - for bbox, (text, conf) in ocr_results: - if conf < 0.5: # Filter low confidence results - continue - - # bbox format: [[x1,y1], [x2,y2], [x3,y3], [x4,y4]] - x_coords = [p[0] for p in bbox] - y_coords = [p[1] for p in bbox] - - # Coordinates in rotated image - box_x0 = min(x_coords) / ZM - box_x1 = max(x_coords) / ZM - box_top = min(y_coords) / ZM - box_bottom = max(y_coords) / ZM - - # Create new box, mark as from rotated table - new_box = { - "text": text, - "x0": box_x0 + table_x0, # Coordinates relative to page - "x1": box_x1 + table_x0, - "top": box_top + table_top + self.page_cum_height[page], - "bottom": box_bottom + table_top + self.page_cum_height[page], - "page_number": page + self.page_from, - "layout_type": "table", - "layoutno": f"table-{table_index}", - "_rotated": True, - "_rotation_angle": best_angle, - "_table_index": table_index, - # Save original coordinates in rotated image for table reconstruction - "_rotated_x0": box_x0, - "_rotated_x1": box_x1, - "_rotated_top": box_top, - "_rotated_bottom": box_bottom, - } - self.boxes.append(new_box) - - logging.info(f"Added {len(ocr_results)} OCR results from rotated table {table_index}") + logging.info(f"Added {added} OCR results from rotated table {table_index}") def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None): start = timer() @@ -1106,7 +1161,7 @@ class RAGFlowPdfParser: max_page_index = len(self.page_images) - 1 def local_page_index(page_number): - idx = page_number - 1 + idx = page_number - 1 if page_number > 0 else 0 if idx > max_page_index and self.page_from: idx = page_number - 1 - self.page_from return idx diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 5ad8132d0..f493583a9 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -677,9 +677,10 @@ Failure: ```json { - "code": 102, - "message": "You don't own the dataset." + "code":108, + "message":"User '' lacks permission for datasets: ''" } + ``` --- diff --git a/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx b/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx index 48e2da47c..4aa70494e 100644 --- a/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx +++ b/web/src/pages/user-setting/data-source/constant/confluence-constant.tsx @@ -7,7 +7,6 @@ export const confluenceConstant = (t: TFunction) => [ name: 'config.credentials.confluence_username', type: FormFieldType.Text, required: true, - tooltip: t('setting.connectorNameTip'), }, { label: 'Confluence Access Token', diff --git a/web/src/pages/user-setting/data-source/constant/index.tsx b/web/src/pages/user-setting/data-source/constant/index.tsx index 986bc3e60..4b4f8cb75 100644 --- a/web/src/pages/user-setting/data-source/constant/index.tsx +++ b/web/src/pages/user-setting/data-source/constant/index.tsx @@ -200,6 +200,7 @@ export const DataSourceFormBaseFields = [ name: 'name', type: FormFieldType.Text, required: true, + tooltip: t('setting.connectorNameTip'), }, { label: 'Source',