Fix: optimize OCR garbage identification to reduce unnecessary filtering (#6027)

### What problem does this PR solve?

Optimize OCR garbage identification to reduce unnecessary filtering.
#5713

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-03-13 18:48:32 +08:00
committed by GitHub
parent 0a877941f4
commit 4ff609b6a8
2 changed files with 18 additions and 18 deletions

View File

@ -46,8 +46,8 @@ class LayoutRecognizer(Recognizer):
def __init__(self, domain):
try:
model_dir = os.path.join(
get_project_base_directory(),
"rag/res/deepdoc")
get_project_base_directory(),
"rag/res/deepdoc")
super().__init__(self.labels, domain, model_dir)
except Exception:
model_dir = snapshot_download(repo_id="InfiniFlow/deepdoc",
@ -60,9 +60,8 @@ class LayoutRecognizer(Recognizer):
def __call__(self, image_list, ocr_res, scale_factor=3,
thr=0.2, batch_size=16, drop=True):
def __is_garbage(b):
patt = [r"^•+$", r"(版权归©|免责条款|地址[:])", r"\.{3,}", "^[0-9]{1,2} / ?[0-9]{1,2}$",
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$",
r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}",
"(资料|数据)来源[:]", "[0-9a-z._-]+@[a-z0-9-]+\\.[a-z]{2,3}",
"\\(cid *: *[0-9]+ *\\)"
]
return any([re.search(p, b["text"]) for p in patt])
@ -160,6 +159,7 @@ class LayoutRecognizer(Recognizer):
def forward(self, image_list, thr=0.7, batch_size=16):
return super().__call__(image_list, thr, batch_size)
class LayoutRecognizer4YOLOv10(LayoutRecognizer):
labels = [
"title",
@ -185,9 +185,9 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
def preprocess(self, image_list):
inputs = []
new_shape = self.input_shape # height, width
new_shape = self.input_shape # height, width
for img in image_list:
shape = img.shape[:2]# current shape [height, width]
shape = img.shape[:2] # current shape [height, width]
# Scale ratio (new / old)
r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
# Compute padding
@ -242,4 +242,3 @@ class LayoutRecognizer4YOLOv10(LayoutRecognizer):
"bbox": [float(t) for t in boxes[i].tolist()],
"score": float(scores[i])
} for i in indices]