mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Fix: remove garbage filtering rules (#11567)
### What problem does this PR solve?

change: remove garbage filtering rules

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -17,7 +17,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
import re
|
# import re
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
from copy import deepcopy
|
from copy import deepcopy
|
||||||
|
|
||||||
@ -62,8 +62,9 @@ class LayoutRecognizer(Recognizer):
|
|||||||
|
|
||||||
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
|
def __call__(self, image_list, ocr_res, scale_factor=3, thr=0.2, batch_size=16, drop=True):
|
||||||
def __is_garbage(b):
|
def __is_garbage(b):
|
||||||
patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
|
return False
|
||||||
return any([re.search(p, b["text"]) for p in patt])
|
# patt = [r"^•+$", "^[0-9]{1,2} / ?[0-9]{1,2}$", r"^[0-9]{1,2} of [0-9]{1,2}$", "^http://[^ ]{12,}", "\\(cid *: *[0-9]+ *\\)"]
|
||||||
|
# return any([re.search(p, b["text"]) for p in patt])
|
||||||
|
|
||||||
if self.client:
|
if self.client:
|
||||||
layouts = self.client.predict(image_list)
|
layouts = self.client.predict(image_list)
|
||||||
|
|||||||
Reference in New Issue
Block a user