Fix OCR GPU provider mem limit handling (#10407)

### What problem does this PR solve?

- Running DeepDoc OCR on large PDFs inside the GPU docker-compose setup would intermittently fail with
  `[ONNXRuntimeError] ... p2o.Clip.6 ... Available memory of 0 is smaller than requested bytes ...`
- Root cause: `load_model()` in `deepdoc/vision/ocr.py` used `device_id=None` as-is, so `torch.cuda.device_count() > device_id` raised a `TypeError`, the helper returned `False`, and ONNX Runtime quietly fell back to `CPUExecutionProvider` with the hard-coded 512 MB limit, which then triggered the allocator failure (see the sketch after this list).
- Environment where this reproduces: Windows 11, AMD 5900X, 64 GB RAM, RTX 3090 (24 GB), upstream `docker-compose-gpu.yml`, default DeepDoc + GraphRAG parser settings, ingesting a heavy PDF such as 《内科学》(第10版).pdf (Internal Medicine, 10th ed., ~180 MB).
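
The comparison failure can be shown in isolation. The sketch below is not the project code; it only mirrors the shape of the old helper, with `device_count` standing in for `torch.cuda.device_count()`, to show how an `int > None` comparison inside a broad `except Exception` turns into a silent `False` and hence a CPU fallback.

```python
def cuda_is_available(device_id=None):
    # Standalone sketch that mirrors the shape of the old helper inside
    # load_model(); device_count stands in for torch.cuda.device_count().
    try:
        device_count = 1
        # On Python 3, "int > None" raises TypeError, so with device_id=None
        # the "return True" below is never reached.
        if device_count > device_id:
            return True
    except Exception:
        # The broad except swallows the TypeError and reports "no GPU".
        return False
    return False


print(cuda_is_available())   # False -> ONNX Runtime session falls back to CPU
print(cuda_is_available(0))  # True  -> GPU path is taken when an id is supplied
```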

Fixes:

- Normalize `device_id` to 0 when it is `None` before calling any CUDA APIs, so the GPU path is considered available.
- Allow configuring the CUDA provider's memory cap via `OCR_GPU_MEM_LIMIT_MB` (default 2048 MB) and expose `OCR_ARENA_EXTEND_STRATEGY`; the calculated byte limit is logged to confirm the effective settings (see the sketch after this list).
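
As a standalone illustration of the new configuration path (it mirrors the diff below rather than importing from it), the following sketch shows how the environment variables translate into the byte value that ends up in the log:

```python
import os

# Standalone sketch of the new configuration path. Defaults match the PR:
# 2048 MB and kNextPowerOfTwo.
gpu_mem_limit_mb = int(os.environ.get("OCR_GPU_MEM_LIMIT_MB", "2048"))
arena_strategy = os.environ.get("OCR_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo")

gpu_mem_limit_bytes = max(gpu_mem_limit_mb, 0) * 1024 * 1024
print(gpu_mem_limit_bytes, arena_strategy)
# With the defaults this prints: 2147483648 kNextPowerOfTwo
# With OCR_GPU_MEM_LIMIT_MB=20480 it prints 21474836480, the value that shows
# up in the log line quoted below.
```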

After the change, ragflow_server.log shows, for example, `load_model ... uses GPU (device 0, gpu_mem_limit=21474836480, arena_strategy=kNextPowerOfTwo)`, and the same document finishes OCR without allocator errors.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: XIANG LI
Date: 2025-10-10 11:03:12 +08:00
Committed by: GitHub
Commit: f631073ac2 (parent: 8aabc2807c)


```diff
--- a/deepdoc/vision/ocr.py
+++ b/deepdoc/vision/ocr.py
@@ -84,7 +84,8 @@ def load_model(model_dir, nm, device_id: int | None = None):
     def cuda_is_available():
         try:
             import torch
-            if torch.cuda.is_available() and torch.cuda.device_count() > device_id:
+            target_id = 0 if device_id is None else device_id
+            if torch.cuda.is_available() and torch.cuda.device_count() > target_id:
                 return True
         except Exception:
             return False
@@ -100,10 +101,13 @@ def load_model(model_dir, nm, device_id: int | None = None):
     # Shrink GPU memory after execution
     run_options = ort.RunOptions()
     if cuda_is_available():
+        gpu_mem_limit_mb = int(os.environ.get("OCR_GPU_MEM_LIMIT_MB", "2048"))
+        arena_strategy = os.environ.get("OCR_ARENA_EXTEND_STRATEGY", "kNextPowerOfTwo")
+        provider_device_id = 0 if device_id is None else device_id
         cuda_provider_options = {
-            "device_id": device_id, # Use specific GPU
-            "gpu_mem_limit": 512 * 1024 * 1024, # Limit gpu memory
-            "arena_extend_strategy": "kNextPowerOfTwo", # gpu memory allocation strategy
+            "device_id": provider_device_id, # Use specific GPU
+            "gpu_mem_limit": max(gpu_mem_limit_mb, 0) * 1024 * 1024,
+            "arena_extend_strategy": arena_strategy, # gpu memory allocation strategy
         }
         sess = ort.InferenceSession(
             model_file_path,
@@ -111,8 +115,8 @@ def load_model(model_dir, nm, device_id: int | None = None):
             providers=['CUDAExecutionProvider'],
             provider_options=[cuda_provider_options]
         )
-        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:" + str(device_id))
-        logging.info(f"load_model {model_file_path} uses GPU")
+        run_options.add_run_config_entry("memory.enable_memory_arena_shrinkage", "gpu:" + str(provider_device_id))
+        logging.info(f"load_model {model_file_path} uses GPU (device {provider_device_id}, gpu_mem_limit={cuda_provider_options['gpu_mem_limit']}, arena_strategy={arena_strategy})")
     else:
         sess = ort.InferenceSession(
             model_file_path,
```
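
For a quick local check, the new variables can also be set in the environment of the Python process that loads the OCR models. The snippet below is a hypothetical usage sketch with illustrative values; whether `docker-compose-gpu.yml` forwards these variables into the container is outside the scope of this diff.

```python
import os

# Hypothetical usage sketch: raise the CUDA provider cap to 20 GB and keep the
# default arena-extend strategy. These must be set before load_model() in
# deepdoc/vision/ocr.py is called for the first time in this process.
os.environ["OCR_GPU_MEM_LIMIT_MB"] = "20480"  # 20480 MB = 21474836480 bytes
os.environ["OCR_ARENA_EXTEND_STRATEGY"] = "kNextPowerOfTwo"
```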