feat(deepdoc): add configurable ONNX thread counts and GPU memory shrinkage (#12777)
### What problem does this PR solve?

This PR addresses critical memory and CPU resource-management issues in high-concurrency environments (multi-worker setups):

- **GPU memory exhaustion (OOM):** Currently, onnxruntime-gpu uses an aggressive memory arena that does not effectively release VRAM back to the system after a task completes. In multi-process worker setups (`$WS > 4`), this leads to BFCArena allocation failures and OOM errors as workers "hoard" VRAM even when idle. This PR introduces an optional GPU memory arena shrinkage toggle to mitigate the issue (see the sketch after this description).
- **CPU oversubscription:** The ONNX `intra_op` and `inter_op` thread counts are currently hardcoded to 2. When running many workers, this causes significant CPU context-switching overhead and degrades performance. This PR makes both values configurable to match the host's actual CPU core density.
- **Multi-GPU support:** The memory-management logic now dynamically targets the correct `device_id`, ensuring stability on systems with multiple GPUs.
- **Transparency:** Detailed initialization logs were added to help administrators verify and troubleshoot their ONNX session configurations.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

Co-authored-by: shakeel <shakeel@lollylaw.com>
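For orientation before the diff, here is a minimal, self-contained sketch of the combined pattern this change introduces. The `make_session` wrapper, the model path, and the provider-options dict are illustrative assumptions; only the environment-variable names and the `memory.enable_memory_arena_shrinkage` run-config key come from the actual change.

```python
# Illustrative sketch only: make_session and the provider-options dict are
# assumptions; the env-var names and the shrinkage config key mirror this PR.
import os
import onnxruntime as ort

def make_session(model_path: str, device_id: int = 0):
    opts = ort.SessionOptions()
    opts.enable_cpu_mem_arena = False
    # Configurable instead of a hardcoded 2, so N workers can split the host's cores.
    opts.intra_op_num_threads = int(os.environ.get("OCR_INTRA_OP_NUM_THREADS", "2"))
    opts.inter_op_num_threads = int(os.environ.get("OCR_INTER_OP_NUM_THREADS", "2"))

    sess = ort.InferenceSession(
        model_path,
        sess_options=opts,
        providers=["CUDAExecutionProvider"],
        provider_options=[{"device_id": device_id}],
    )
    run_opts = ort.RunOptions()
    # Opt-in arena shrinkage: at the end of each Run(), ORT returns unused
    # BFC-arena chunks on this device to the system instead of hoarding them.
    if os.environ.get("OCR_GPUMEM_ARENA_SHRINKAGE") == "1":
        run_opts.add_run_config_entry("memory.enable_memory_arena_shrinkage", f"gpu:{device_id}")
    return sess, run_opts
```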
```diff
@@ -96,8 +96,9 @@ def load_model(model_dir, nm, device_id: int | None = None):
     options = ort.SessionOptions()
     options.enable_cpu_mem_arena = False
     options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
-    options.intra_op_num_threads = 2
-    options.inter_op_num_threads = 2
+    # Prevent CPU oversubscription by allowing explicit thread control in multi-worker environments
+    options.intra_op_num_threads = int(os.environ.get("OCR_INTRA_OP_NUM_THREADS", "2"))
+    options.inter_op_num_threads = int(os.environ.get("OCR_INTER_OP_NUM_THREADS", "2"))
 
     # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580
     # Shrink GPU memory after execution
```
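As a sizing aid (my own heuristic, not guidance from the PR): give each of the `$WS` workers an equal slice of the host's cores so the fleet as a whole avoids oversubscription. A hedged sketch:

```python
# Heuristic sketch, not part of the PR: split cores evenly across WS workers.
# WS is the worker-count variable mentioned in the PR description.
import os

cores = os.cpu_count() or 1
workers = int(os.environ.get("WS", "1"))
os.environ.setdefault("OCR_INTRA_OP_NUM_THREADS", str(max(1, cores // workers)))
# With ExecutionMode.ORT_SEQUENTIAL (as set in the hunk above), operators run
# one at a time, so inter-op threads sit mostly idle; 1 is a safe floor.
os.environ.setdefault("OCR_INTER_OP_NUM_THREADS", "1")
```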
```diff
@@ -117,6 +118,11 @@ def load_model(model_dir, nm, device_id: int | None = None):
             providers=['CUDAExecutionProvider'],
             provider_options=[cuda_provider_options]
         )
+        # Explicit arena shrinkage for GPU to release VRAM back to the system after each run
+        if os.environ.get("OCR_GPUMEM_ARENA_SHRINKAGE") == "1":
+            run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", f"gpu:{provider_device_id}")
+            logging.info(
+                f"load_model {model_file_path} enabled GPU memory arena shrinkage on device {provider_device_id}")
         logging.info(f"load_model {model_file_path} uses GPU (device {provider_device_id}, gpu_mem_limit={cuda_provider_options['gpu_mem_limit']}, arena_strategy={arena_strategy})")
     else:
         sess = ort.InferenceSession(
```
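One operational note: `memory.enable_memory_arena_shrinkage` is a per-run setting, so the configured `RunOptions` must accompany every `run()` call for shrinkage to fire. A usage sketch (the input name and shape are placeholders, not the real model bindings):

```python
import numpy as np

# Assumes sess/run_opts from the sketch above; "x" and the shape are
# placeholders for the actual model's input binding.
input_tensor = np.zeros((1, 3, 48, 320), dtype=np.float32)
outputs = sess.run(None, {"x": input_tensor}, run_options=run_opts)
```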