Skip to content

Commit fde7126

Browse files
authored
[misc] Add retries with exponential backoff for HF file existence check (#13008)
1 parent 2431371 commit fde7126

File tree

1 file changed

+48
-13
lines changed

1 file changed

+48
-13
lines changed

vllm/transformers_utils/config.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import enum
44
import json
55
import os
6+
import time
67
from pathlib import Path
78
from typing import Any, Dict, Literal, Optional, Type, Union
89

@@ -100,15 +101,33 @@ def file_or_path_exists(model: Union[str, Path], config_name: str,
100101

101102
# NB: file_exists will only check for the existence of the config file on
102103
# hf_hub. This will fail in offline mode.
103-
try:
104-
return file_exists(model,
105-
config_name,
106-
revision=revision,
107-
token=HF_TOKEN)
108-
except huggingface_hub.errors.OfflineModeIsEnabled:
109-
# Don't raise in offline mode, all we know is that we don't have this
110-
# file cached.
111-
return False
104+
105+
# Call HF to check if the file exists
106+
# Up to 2 attempts in total (i.e. at most 1 retry), with exponential backoff
107+
max_retries = 2
108+
retry_delay = 2
109+
for attempt in range(max_retries):
110+
try:
111+
return file_exists(model,
112+
config_name,
113+
revision=revision,
114+
token=HF_TOKEN)
115+
except huggingface_hub.errors.OfflineModeIsEnabled:
116+
# Don't raise in offline mode,
117+
# all we know is that we don't have this
118+
# file cached.
119+
return False
120+
except Exception as e:
121+
logger.error(
122+
"Error checking file existence: %s, retrying %d of %d", e,
123+
attempt + 1, max_retries)
124+
if attempt == max_retries - 1:
125+
logger.error("Error checking file existence: %s", e)
126+
raise
127+
time.sleep(retry_delay)
128+
retry_delay *= 2
129+
continue
130+
return False
112131

113132

114133
def patch_rope_scaling(config: PretrainedConfig) -> None:
@@ -193,10 +212,26 @@ def get_config(
193212
# raise an offline mode error to indicate to the user that they
194213
# don't have files cached and may need to go online.
195214
# This is conveniently triggered by calling file_exists().
196-
file_exists(model,
197-
HF_CONFIG_NAME,
198-
revision=revision,
199-
token=HF_TOKEN)
215+
216+
# Call HF to check if the file exists
217+
# 2 attempts in total (max_retries counts attempts), with exponential backoff
218+
max_retries = 2
219+
retry_delay = 2
220+
for attempt in range(max_retries):
221+
try:
222+
file_exists(model,
223+
HF_CONFIG_NAME,
224+
revision=revision,
225+
token=HF_TOKEN)
226+
except Exception as e:
227+
logger.error(
228+
"Error checking file existence: %s, retrying %d of %d",
229+
e, attempt + 1, max_retries)
230+
if attempt == max_retries:
231+
logger.error("Error checking file existence: %s", e)
232+
raise e
233+
time.sleep(retry_delay)
234+
retry_delay *= 2
200235

201236
raise ValueError(f"No supported config format found in {model}")
202237

0 commit comments

Comments
 (0)