mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Fix: remove garbage filtering rules (#11567)
### What problem does this PR solve?

Change: remove garbage filtering rules.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -17,7 +17,7 @@
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import re
|
||||
# import re
|
||||
from collections import Counter
|
||||
from copy import deepcopy
|
||||
|
||||
@ -62,8 +62,9 @@ class LayoutRecognizer(Recognizer):
|
||||
|
||||
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
|
||||
def __is_garbage(b):
|
||||
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
|
||||
return any([re.search(p, b["text"]) for p in patt])
|
||||
return False
|
||||
# patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
|
||||
# return any([re.search(p, b["text"]) for p in patt])
|
||||
|
||||
if self.client:
|
||||
layouts = self.client.predict(image_list)
|
||||
|
||||
Reference in New Issue
Block a user