From 678392c04013d59fd460cd20dcdf13a84b226f0f Mon Sep 17 00:00:00 2001 From: apps-lycusinc Date: Fri, 23 Jan 2026 08:36:28 +0500 Subject: [PATCH] feat(deepdoc): add configurable ONNX thread counts and GPU memory shrinkage (#12777) ### What problem does this PR solve? This PR addresses critical memory and CPU resource-management issues in high-concurrency environments (multi-worker setups): GPU Memory Exhaustion (OOM): Currently, onnxruntime-gpu uses an aggressive memory arena that does not effectively release VRAM back to the system after a task completes. In multi-process worker setups (e.g., more than 4 workers), this leads to BFCArena allocation failures and OOM errors because workers "hoard" VRAM even when idle. This PR introduces an optional GPU memory arena shrinkage toggle (set OCR_GPUMEM_ARENA_SHRINKAGE=1) to mitigate this issue. CPU Oversubscription: The ONNX intra_op and inter_op thread counts are currently hardcoded to 2. When running many workers, this causes significant CPU context-switching overhead and degrades performance. This PR makes these values configurable (via OCR_INTRA_OP_NUM_THREADS and OCR_INTER_OP_NUM_THREADS) so they can match the host's actual CPU core density. Multi-GPU Support: The memory-management logic now dynamically targets the correct device_id, ensuring stability on systems with multiple GPUs. Transparency: Added detailed initialization logs to help administrators verify and troubleshoot their ONNX session configurations. 
### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) Co-authored-by: shakeel --- deepdoc/vision/ocr.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/deepdoc/vision/ocr.py b/deepdoc/vision/ocr.py index afa692127..1f573bda5 100644 --- a/deepdoc/vision/ocr.py +++ b/deepdoc/vision/ocr.py @@ -96,8 +96,9 @@ def load_model(model_dir, nm, device_id: int | None = None): options = ort.SessionOptions() options.enable_cpu_mem_arena = False options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL - options.intra_op_num_threads = 2 - options.inter_op_num_threads = 2 + # Prevent CPU oversubscription by allowing explicit thread control in multi-worker environments + options.intra_op_num_threads = int(os.environ.get("OCR_INTRA_OP_NUM_THREADS", "2")) + options.inter_op_num_threads = int(os.environ.get("OCR_INTER_OP_NUM_THREADS", "2")) # https://github.com/microsoft/onnxruntime/issues/9509#issuecomment-951546580 # Shrink GPU memory after execution @@ -117,6 +118,11 @@ def load_model(model_dir, nm, device_id: int | None = None): providers=['CUDAExecutionProvider'], provider_options=[cuda_provider_options] ) + # Explicit arena shrinkage for GPU to release VRAM back to the system after each run + if os.environ.get("OCR_GPUMEM_ARENA_SHRINKAGE") == "1": + run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", f"gpu:{provider_device_id}") + logging.info( + f"load_model {model_file_path} enabled GPU memory arena shrinkage on device {provider_device_id}") logging.info(f"load_model {model_file_path} uses GPU (device {provider_device_id}, gpu_mem_limit={cuda_provider_options['gpu_mem_limit']}, arena_strategy={arena_strategy})") else: sess = ort.InferenceSession(