Fix: Azure OpenAI retry (#10213)

### What problem does this PR solve?

Currently, Azure OpenAI returns per-minute quota-limit errors when the
chat API is used, and these errors are not retried. This change makes
quota errors retryable for Azure chat models, which is needed to
process almost any document using models deployed in Azure Foundry.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Dominik Novotný
2025-09-23 06:19:28 +02:00
committed by GitHub
parent 4eb7659499
commit 1c84d1b562

View File

@ -193,21 +193,30 @@ class Base(ABC):
return ans + LENGTH_NOTIFICATION_CN return ans + LENGTH_NOTIFICATION_CN
return ans + LENGTH_NOTIFICATION_EN return ans + LENGTH_NOTIFICATION_EN
def _exceptions(self, e, attempt): @property
def _retryable_errors(self) -> set[str]:
return {
LLMErrorCode.ERROR_RATE_LIMIT,
LLMErrorCode.ERROR_SERVER,
}
def _should_retry(self, error_code: str) -> bool:
return error_code in self._retryable_errors
def _exceptions(self, e, attempt) -> str | None:
logging.exception("OpenAI chat_with_tools") logging.exception("OpenAI chat_with_tools")
# Classify the error # Classify the error
error_code = self._classify_error(e) error_code = self._classify_error(e)
if attempt == self.max_retries: if attempt == self.max_retries:
error_code = LLMErrorCode.ERROR_MAX_RETRIES error_code = LLMErrorCode.ERROR_MAX_RETRIES
# Check if it's a rate limit error or server error and not the last attempt if self._should_retry(error_code):
should_retry = error_code == LLMErrorCode.ERROR_RATE_LIMIT or error_code == LLMErrorCode.ERROR_SERVER
if not should_retry:
return f"{ERROR_PREFIX}: {error_code} - {str(e)}"
delay = self._get_delay() delay = self._get_delay()
logging.warning(f"Error: {error_code}. Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{self.max_retries})") logging.warning(f"Error: {error_code}. Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{self.max_retries})")
time.sleep(delay) time.sleep(delay)
return None
return f"{ERROR_PREFIX}: {error_code} - {str(e)}"
def _verbose_tool_use(self, name, args, res): def _verbose_tool_use(self, name, args, res):
return "<tool_call>" + json.dumps({"name": name, "args": args, "result": res}, ensure_ascii=False, indent=2) + "</tool_call>" return "<tool_call>" + json.dumps({"name": name, "args": args, "result": res}, ensure_ascii=False, indent=2) + "</tool_call>"
@ -536,6 +545,14 @@ class AzureChat(Base):
self.client = AzureOpenAI(api_key=api_key, azure_endpoint=base_url, api_version=api_version) self.client = AzureOpenAI(api_key=api_key, azure_endpoint=base_url, api_version=api_version)
self.model_name = model_name self.model_name = model_name
@property
def _retryable_errors(self) -> set[str]:
return {
LLMErrorCode.ERROR_RATE_LIMIT,
LLMErrorCode.ERROR_SERVER,
LLMErrorCode.ERROR_QUOTA,
}
class BaiChuanChat(Base): class BaiChuanChat(Base):
_FACTORY_NAME = "BaiChuan" _FACTORY_NAME = "BaiChuan"
@ -1424,21 +1441,30 @@ class LiteLLMBase(ABC):
return ans + LENGTH_NOTIFICATION_CN return ans + LENGTH_NOTIFICATION_CN
return ans + LENGTH_NOTIFICATION_EN return ans + LENGTH_NOTIFICATION_EN
def _exceptions(self, e, attempt): @property
def _retryable_errors(self) -> set[str]:
return {
LLMErrorCode.ERROR_RATE_LIMIT,
LLMErrorCode.ERROR_SERVER,
}
def _should_retry(self, error_code: str) -> bool:
return error_code in self._retryable_errors
def _exceptions(self, e, attempt) -> str | None:
logging.exception("OpenAI chat_with_tools") logging.exception("OpenAI chat_with_tools")
# Classify the error # Classify the error
error_code = self._classify_error(e) error_code = self._classify_error(e)
if attempt == self.max_retries: if attempt == self.max_retries:
error_code = LLMErrorCode.ERROR_MAX_RETRIES error_code = LLMErrorCode.ERROR_MAX_RETRIES
# Check if it's a rate limit error or server error and not the last attempt if self._should_retry(error_code):
should_retry = error_code == LLMErrorCode.ERROR_RATE_LIMIT or error_code == LLMErrorCode.ERROR_SERVER
if not should_retry:
return f"{ERROR_PREFIX}: {error_code} - {str(e)}"
delay = self._get_delay() delay = self._get_delay()
logging.warning(f"Error: {error_code}. Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{self.max_retries})") logging.warning(f"Error: {error_code}. Retrying in {delay:.2f} seconds... (Attempt {attempt + 1}/{self.max_retries})")
time.sleep(delay) time.sleep(delay)
return None
return f"{ERROR_PREFIX}: {error_code} - {str(e)}"
def _verbose_tool_use(self, name, args, res): def _verbose_tool_use(self, name, args, res):
return "<tool_call>" + json.dumps({"name": name, "args": args, "result": res}, ensure_ascii=False, indent=2) + "</tool_call>" return "<tool_call>" + json.dumps({"name": name, "args": args, "result": res}, ensure_ascii=False, indent=2) + "</tool_call>"