From a674338c219f2e266e8e6f300dbd257a0f960c7a Mon Sep 17 00:00:00 2001
From: buua436 <66937541+buua436@users.noreply.github.com>
Date: Thu, 27 Nov 2025 17:54:49 +0800
Subject: [PATCH] Fix: remove garbage filtering rules (#11567)

### What problem does this PR solve?

change: remove garbage filtering rules

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/vision/layout_recognizer.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py
index 8068cbc07..5b79e2bf5 100644
--- a/deepdoc/vision/layout_recognizer.py
+++ b/deepdoc/vision/layout_recognizer.py
@@ -17,7 +17,7 @@
 import logging
 import math
 import os
-import re
+# import re
 from collections import Counter
 from copy import deepcopy
 
@@ -62,8 +62,9 @@ class LayoutRecognizer(Recognizer):
 
     def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
         def __is_garbage(b):
-            patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
-            return any([re.search(p, b["text"]) for p in patt])
+            return False
+            # patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
+            # return any([re.search(p, b["text"]) for p in patt])
 
         if self.client:
             layouts = self.client.predict(image_list)