Refa: Make PARALLEL_DEVICES a static, module-level setting instead of a constructor parameter. (#6168)

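### What problem does this PR solve?

`RAGFlowPdfParser.__init__` no longer accepts a `parallel_devices` argument and no longer forwards it to `OCR(...)`. The parser instead reads the module-level `PARALLEL_DEVICES` constant imported from `rag.settings`, both when building its list of `trio.CapacityLimiter` objects and when assigning pages to devices during OCR.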


### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu
2025-03-17 16:49:54 +08:00
committed by GitHub
parent 45fe02c8b3
commit 3a99c2b5f4
6 changed files with 29 additions and 28 deletions

View File

@ -37,13 +37,15 @@ from rag.nlp import rag_tokenizer
from copy import deepcopy
from huggingface_hub import snapshot_download
from rag.settings import PARALLEL_DEVICES
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
class RAGFlowPdfParser:
def __init__(self, parallel_devices: int | None = None):
def __init__(self):
"""
If you have trouble downloading HuggingFace models, -_^ this might help!!
@ -56,11 +58,10 @@ class RAGFlowPdfParser:
"""
self.ocr = OCR(parallel_devices = parallel_devices)
self.parallel_devices = parallel_devices
self.ocr = OCR()
self.parallel_limiter = None
if parallel_devices is not None and parallel_devices > 1:
self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(parallel_devices)]
if PARALLEL_DEVICES is not None and PARALLEL_DEVICES > 1:
self.parallel_limiter = [trio.CapacityLimiter(1) for _ in range(PARALLEL_DEVICES)]
if hasattr(self, "model_speciess"):
self.layouter = LayoutRecognizer("layout." + self.model_speciess)
@ -1018,7 +1019,6 @@ class RAGFlowPdfParser:
self.pdf.close()
if not self.outlines:
logging.warning("Miss outlines")
logging.debug("Images converted.")
self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(
@ -1066,8 +1066,8 @@ class RAGFlowPdfParser:
for i, img in enumerate(self.page_images):
chars = __ocr_preprocess()
nursery.start_soon(__img_ocr, i, i % self.parallel_devices, img, chars,
self.parallel_limiter[i % self.parallel_devices])
nursery.start_soon(__img_ocr, i, i % PARALLEL_DEVICES, img, chars,
self.parallel_limiter[i % PARALLEL_DEVICES])
await trio.sleep(0.1)
else:
for i, img in enumerate(self.page_images):