From 678392c04013d59fd460cd20dcdf13a84b226f0f Mon Sep 17 00:00:00 2001 From: apps-lycusinc Date: Fri, 23 Jan 2026 08:36:28 +0500 Subject: [PATCH] feat(deepdoc): add configurable ONNX thread counts and GPU memory shrinkage (#12777) ### What problem does this PR solve? This PR addresses critical memory and CPU resource-management issues in high-concurrency environments (multi-worker setups): GPU Memory Exhaustion (OOM): Currently, onnxruntime-gpu uses an aggressive memory arena that does not effectively release VRAM back to the system after a task completes. In multi-process worker setups (e.g., more than 4 workers), this leads to BFCArena allocation failures and OOM errors because workers "hoard" VRAM even when idle. This PR introduces an optional GPU memory arena shrinkage toggle (set OCR_GPUMEM_ARENA_SHRINKAGE=1) to mitigate this issue. CPU Oversubscription: The ONNX intra_op and inter_op thread counts are currently hardcoded to 2. When running many workers, this causes significant CPU context-switching overhead and degrades performance. This PR makes these values configurable (via OCR_INTRA_OP_NUM_THREADS and OCR_INTER_OP_NUM_THREADS) so they can match the host's actual CPU core density. Multi-GPU Support: The memory-management logic now dynamically targets the correct device_id, ensuring stability on systems with multiple GPUs. Transparency: Added detailed initialization logs to help administrators verify and troubleshoot their ONNX session configurations. 
### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Co-authored-by: shakeel --- deepdoc/vision/ocr.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py index afa692127..1f573bda5 100644 --- a/deepdoc/vision/ocr.py +++ b/deepdoc/vision/ocr.py @@ -96,8 +96,9 @@ def load_model(model_dir, nm, device_id: int | None = None): options = ort.SessionOptions() options.enable_cpu_mem_arena = False options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL - options.intra_op_num_threads = 2 - options.inter_op_num_threads = 2 + # Prevent CPU oversubscription by allowing explicit thread control in multi-worker environments + options.intra_op_num_threads = int(os.environ.get("OCR_INTRA_OP_NUM_THREADS", "2")) + options.inter_op_num_threads = int(os.environ.get("OCR_INTER_OP_NUM_THREADS", "2")) # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580 # Shrink GPU memory after execution @@ -117,6 +118,11 @@ def load_model(model_dir, nm, device_id: int | None = None): providers=['CUDAExecutionProvider'], provider_options=[cuda_provider_options] ) + # Explicit arena shrinkage for GPU to release VRAM back to the system after each run + if os.environ.get("OCR_GPUMEM_ARENA_SHRINKAGE") == "1": + run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", f"gpu:{provider_device_id}") + logging.info( + f"load_model {model_file_path} enabled GPU memory arena shrinkage on device {provider_device_id}") logging.info(f"load_model {model_file_path} uses GPU (device {provider_device_id}, gpu_mem_limit={cuda_provider_options['gpu_mem_limit']}, arena_strategy={arena_strategy})") else: sess = ort.InferenceSession(