mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Upgrades Document Layout Analysis model. (#4054)
### What problem does this PR solve?

#4052

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -14,11 +14,14 @@ import os
|
||||
import re
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
from api.utils.file_utils import get_project_base_directory
|
||||
from deepdoc.vision import Recognizer
|
||||
from deepdoc.vision.operators import nms
|
||||
|
||||
|
||||
class LayoutRecognizer(Recognizer):
|
||||
@ -149,3 +152,88 @@ class LayoutRecognizer(Recognizer):
|
||||
|
||||
ocr_res = [b for b in ocr_res if b["text"].strip() not in garbag_set]
|
||||
return ocr_res, page_layout
|
||||
|
||||
|
||||
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
    """Layout recognizer using letterbox preprocessing and per-class NMS.

    NOTE(review): the name suggests a YOLOv10-style detector; the model is
    loaded by the parent class, which is not visible here -- confirm there.
    """

    # Class-id -> label-name table.  Several names appear more than once, so
    # distinct class ids map to the same label (presumably merging the
    # model's finer-grained classes -- verify against the model's class list).
    # `postprocess` reads `self.label_list`, which is presumably populated
    # from this table by the parent constructor -- TODO confirm.
    labels = [
        "title",
        "Text",
        "Reference",
        "Figure",
        "Figure caption",
        "Table",
        "Table caption",
        "Table caption",
        "Equation",
        "Figure caption",
    ]

    def __init__(self, domain):
        """Initialise the recognizer.

        Args:
            domain: Accepted for signature compatibility only; it is
                overwritten with ``"layout"`` before being handed to the
                parent constructor.
        """
        domain = "layout"
        super().__init__(domain)
        # Letterbox options.  Of these, only `center` is read by the methods
        # of this class; the others are stored but not consulted here.
        self.auto = False
        self.scaleFill = False
        self.scaleup = True
        self.stride = 32
        self.center = True

    def preprocess(self, image_list):
        """Letterbox each image to ``self.input_shape`` and build model feeds.

        Every image is converted BGR -> RGB, resized with its aspect ratio
        preserved, padded with the constant colour (114, 114, 114) up to the
        target shape, scaled by 1/255 and rearranged to a 1xCxHxW float32
        array.

        Args:
            image_list: Iterable of images as arrays whose first two
                dimensions are (height, width); passed to
                ``cv2.cvtColor(..., cv2.COLOR_BGR2RGB)``.

        Returns:
            A list with one dict per image containing:
              * ``self.input_names[0]``: the preprocessed array;
              * ``"scale_factor"``: ``[x_ratio, y_ratio, x_offset, y_offset]``
                where the ratios are original size / resized size and the
                offsets are the left/top border widths that were added.
                ``postprocess`` uses these to map boxes back to the original
                image.
        """
        inputs = []
        target_h, target_w = self.input_shape[0], self.input_shape[1]
        for img in image_list:
            orig_h, orig_w = img.shape[:2]
            # Single scale ratio (new / old) so the aspect ratio is preserved.
            r = min(target_h / orig_h, target_w / orig_w)
            # Size after resizing, before padding: (width, height).
            ww, hh = int(round(orig_w * r)), int(round(orig_h * r))
            # Total padding, split evenly between the two sides.
            dw = (target_w - ww) / 2
            dh = (target_h - hh) / 2

            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
            img = cv2.resize(img, (ww, hh), interpolation=cv2.INTER_LINEAR)

            # The -0.1 / +0.1 nudges make an odd total padding split as
            # (floor, ceil) instead of both halves rounding the same way.
            top = int(round(dh - 0.1)) if self.center else 0
            bottom = int(round(dh + 0.1))
            left = int(round(dw - 0.1)) if self.center else 0
            right = int(round(dw + 0.1))
            img = cv2.copyMakeBorder(
                img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=(114, 114, 114)
            )  # add border

            img /= 255.0
            img = img.transpose(2, 0, 1)  # HWC -> CHW
            img = img[np.newaxis, :, :, :].astype(np.float32)

            # Record the integer offsets that were actually applied (left,
            # top) rather than the un-rounded halves, so that postprocess
            # removes exactly the shift introduced by the border.
            inputs.append({
                self.input_names[0]: img,
                "scale_factor": [orig_w / ww, orig_h / hh, left, top],
            })

        return inputs

    def postprocess(self, boxes, inputs, thr):
        """Convert raw detections into labelled boxes in original coordinates.

        Each detection row is read as: columns 0-3 box coordinates in
        (x, y, x, y) order, column 4 the score, and the last column the
        class id.

        Args:
            boxes: Raw model output; singleton dimensions are squeezed away.
            inputs: The feed dict produced by ``preprocess`` for this image;
                only ``inputs["scale_factor"]`` is read.
            thr: Ignored -- a fixed score threshold of 0.08 is always used.

        Returns:
            A list of dicts with keys ``"type"`` (lower-cased label),
            ``"bbox"`` (four floats) and ``"score"`` (float), or an empty
            list when nothing passes the threshold.
        """
        thr = 0.08
        # atleast_2d keeps a lone detection as one row; a bare squeeze would
        # collapse it to 1-D and break the column indexing below.
        boxes = np.atleast_2d(np.squeeze(boxes))
        if boxes.size == 0:
            return []

        keep = boxes[:, 4] > thr
        boxes = boxes[keep, :]  # boolean indexing copies; caller's array is untouched
        if len(boxes) == 0:
            return []
        scores = boxes[:, 4]
        class_ids = boxes[:, -1].astype(int)

        # Undo the letterbox: remove the border offset, then rescale.
        x_ratio, y_ratio, x_offset, y_offset = inputs["scale_factor"]
        boxes = boxes[:, :4]
        boxes[:, 0] -= x_offset
        boxes[:, 2] -= x_offset
        boxes[:, 1] -= y_offset
        boxes[:, 3] -= y_offset
        ratios = np.array([x_ratio, y_ratio, x_ratio, y_ratio])
        boxes = np.multiply(boxes, ratios, dtype=np.float32)

        # Run NMS separately for each class so that boxes of different
        # classes never suppress each other.
        indices = []
        for class_id in np.unique(class_ids):
            class_indices = np.where(class_ids == class_id)[0]
            class_boxes = boxes[class_indices, :]
            class_scores = scores[class_indices]
            # 0.45 is the third argument of `nms` (presumably the IoU
            # threshold -- confirm in deepdoc.vision.operators).
            class_keep_boxes = nms(class_boxes, class_scores, 0.45)
            indices.extend(class_indices[class_keep_boxes])

        return [{
            "type": self.label_list[class_ids[i]].lower(),
            "bbox": [float(t) for t in boxes[i].tolist()],
            "score": float(scores[i])
        } for i in indices]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user