Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle backoff on HTTP 503 error when pushing repeatedly #1038

Merged
merged 6 commits into from
Sep 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 13 additions & 25 deletions src/huggingface_hub/file_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import re
import sys
import tempfile
import time
import warnings
from contextlib import contextmanager
from functools import partial
Expand All @@ -20,6 +19,7 @@
import requests
from filelock import FileLock
from huggingface_hub import constants
from requests.exceptions import ConnectTimeout, ProxyError

from . import __version__
from .constants import (
Expand All @@ -37,6 +37,7 @@
EntryNotFoundError,
LocalEntryNotFoundError,
hf_raise_for_status,
http_backoff,
logging,
tqdm,
validate_hf_hub_args,
Expand Down Expand Up @@ -478,30 +479,17 @@ def _request_wrapper(
return response

# 3. Exponential backoff
tries, success = 0, False
while not success:
tries += 1
try:
response = requests.request(
method=method.upper(), url=url, timeout=timeout, **params
)
success = True
except (
requests.exceptions.ConnectTimeout,
requests.exceptions.ProxyError,
) as err:
if tries > max_retries:
raise err
else:
logger.info(
f"{method} request to {url} timed out, retrying..."
f" [{tries/max_retries}]"
)
sleep_time = min(
max_wait_time, base_wait_time * 2 ** (tries - 1)
) # Exponential backoff
time.sleep(sleep_time)
return response
return http_backoff(
method=method,
url=url,
max_retries=max_retries,
base_wait_time=base_wait_time,
max_wait_time=max_wait_time,
retry_on_exceptions=(ConnectTimeout, ProxyError),
retry_on_status_codes=(),
timeout=timeout,
**params,
)


def _request_with_retry(*args, **kwargs) -> requests.Response:
Expand Down
6 changes: 3 additions & 3 deletions src/huggingface_hub/lfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from huggingface_hub.constants import ENDPOINT, REPO_TYPES_URL_PREFIXES
from requests.auth import HTTPBasicAuth

from .utils import hf_raise_for_status, validate_hf_hub_args
from .utils import hf_raise_for_status, http_backoff, validate_hf_hub_args
from .utils.sha import sha256, sha_fileobj


Expand Down Expand Up @@ -308,7 +308,7 @@ def _upload_single_part(upload_url: str, fileobj: BinaryIO):

Raises: `requests.HTTPError` if the upload resulted in an error
"""
upload_res = requests.put(upload_url, data=fileobj)
upload_res = http_backoff("PUT", upload_url, data=fileobj)
hf_raise_for_status(upload_res)
return upload_res

Expand Down Expand Up @@ -376,7 +376,7 @@ def _upload_multi_part(
seek_from=chunk_size * part_idx,
read_limit=chunk_size,
) as fileobj_slice:
part_upload_res = requests.put(part_upload_url, data=fileobj_slice)
part_upload_res = http_backoff("PUT", part_upload_url, data=fileobj_slice)
hf_raise_for_status(part_upload_res)
etag = part_upload_res.headers.get("etag")
if etag is None or etag == "":
Expand Down
1 change: 1 addition & 0 deletions src/huggingface_hub/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
RevisionNotFoundError,
hf_raise_for_status,
)
from ._http import http_backoff
from ._paths import filter_repo_objects
from ._subprocess import run_subprocess
from ._validators import HFValidationError, validate_hf_hub_args, validate_repo_id
Expand Down
131 changes: 131 additions & 0 deletions src/huggingface_hub/utils/_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to handle HTTP requests in Huggingface Hub."""
import time
from http import HTTPStatus
from typing import Tuple, Type, Union

import requests
from requests import Response
from requests.exceptions import ConnectTimeout, ProxyError

from . import logging
from ._typing import Literal


# Module-level logger (huggingface_hub's logging wrapper around stdlib logging).
logger = logging.get_logger(__name__)

# HTTP verbs accepted by `http_backoff`; type alias for readability in signatures.
HTTP_METHOD_T = Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]


def http_backoff(
    method: HTTP_METHOD_T,
    url: str,
    *,
    max_retries: int = 5,
    base_wait_time: float = 1,
    max_wait_time: float = 8,
    retry_on_exceptions: Union[Type[Exception], Tuple[Type[Exception], ...]] = (
        ConnectTimeout,
        ProxyError,
    ),
    retry_on_status_codes: Union[int, Tuple[int, ...]] = HTTPStatus.SERVICE_UNAVAILABLE,
    **kwargs,
) -> Response:
    """Wrapper around requests to retry calls on an endpoint, with exponential backoff.

    Endpoint call is retried on exceptions (ex: connection timeout, proxy error,...)
    and/or on specific status codes (ex: service unavailable). If the call failed more
    than `max_retries` times, the exception is thrown or `raise_for_status` is called
    on the response object.

    Re-implements mechanisms from the `backoff` library to avoid adding an external
    dependency to `huggingface_hub`. See https://github.com/litl/backoff.

    Args:
        method (`Literal["GET", "OPTIONS", "HEAD", "POST", "PUT", "PATCH", "DELETE"]`):
            HTTP method to perform.
        url (`str`):
            The URL of the resource to fetch.
        max_retries (`int`, *optional*, defaults to `5`):
            Maximum number of retries. `0` means the request is attempted exactly once.
        base_wait_time (`float`, *optional*, defaults to `1`):
            Duration (in seconds) to wait before retrying the first time.
            Wait time between retries then grows exponentially, capped by
            `max_wait_time`.
        max_wait_time (`float`, *optional*, defaults to `8`):
            Maximum duration (in seconds) to wait before retrying.
        retry_on_exceptions (`Type[Exception]` or `Tuple[Type[Exception]]`, *optional*, defaults to `(ConnectTimeout, ProxyError,)`):
            Define which exceptions must be caught to retry the request. Can be a single
            type or a tuple of types.
            By default, retry on `ConnectTimeout` and `ProxyError`.
        retry_on_status_codes (`int` or `Tuple[int]`, *optional*, defaults to `503`):
            Define on which status codes the request must be retried. By default, only
            HTTP 503 Service Unavailable is retried.
        **kwargs (`dict`, *optional*):
            kwargs to pass to `requests.request`.

    Example:
    ```
    >>> from huggingface_hub.utils import http_backoff

    # Same usage as "requests.request".
    >>> response = http_backoff("GET", "https://www.google.com")
    >>> response.raise_for_status()

    # If you expect a Gateway Timeout from time to time
    >>> http_backoff("PUT", upload_url, data=data, retry_on_status_codes=504)
    >>> response.raise_for_status()
    ```
    """
    # Normalize single values to tuples so membership tests below always work.
    if isinstance(retry_on_exceptions, type):  # Tuple from single exception type
        retry_on_exceptions = (retry_on_exceptions,)

    if isinstance(retry_on_status_codes, int):  # Tuple from single status code
        retry_on_status_codes = (retry_on_status_codes,)

    nb_tries = 0
    sleep_time = base_wait_time
    while True:
        nb_tries += 1
        try:
            # Perform request and return if status_code is not in the retry list.
            response = requests.request(method=method, url=url, **kwargs)
            if response.status_code not in retry_on_status_codes:
                return response

            # Wrong status code returned (HTTP 503 for instance)
            logger.warning(
                f"HTTP Error {response.status_code} thrown while requesting"
                f" {method} {url}"
            )
            if nb_tries > max_retries:
                response.raise_for_status()  # Will raise uncaught exception
                # We return response to avoid infinite loop in the corner case where the
                # user ask for retry on a status code that doesn't raise_for_status.
                return response

        except retry_on_exceptions as err:
            logger.warning(f"'{err}' thrown while requesting {method} {url}")

            if nb_tries > max_retries:
                # Bare `raise` keeps the original traceback intact.
                raise

        # Sleep for X seconds.
        # NOTE: fixed log message: `{nb_tries/max_retries}` performed a float
        # division (e.g. "[0.2]") instead of displaying a "tries/total" counter.
        logger.warning(f"Retrying in {sleep_time}s [Retry {nb_tries}/{max_retries}].")
        time.sleep(sleep_time)

        # Update sleep time for next retry
        sleep_time = min(max_wait_time, sleep_time * 2)  # Exponential backoff
Wauplin marked this conversation as resolved.
Show resolved Hide resolved
127 changes: 127 additions & 0 deletions tests/test_utils_http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import time
import unittest
from typing import Generator
from unittest.mock import Mock, call, patch

from huggingface_hub.utils._http import http_backoff
from requests import ConnectTimeout, HTTPError


URL = "https://www.google.com"


@patch("huggingface_hub.utils._http.requests.request")
class TestHttpBackoff(unittest.TestCase):
    """Unit tests for `http_backoff`. `requests.request` is mocked in every test."""

    def test_backoff_no_errors(self, mock_request: Mock) -> None:
        """Test normal usage of `http_backoff`."""
        data_mock = Mock()
        response = http_backoff("GET", URL, data=data_mock)
        # Extra kwargs (here `data`) must be forwarded untouched to `requests.request`.
        mock_request.assert_called_once_with(method="GET", url=URL, data=data_mock)
        self.assertIs(response, mock_request())

    def test_backoff_3_calls(self, mock_request: Mock) -> None:
        """Test `http_backoff` with 2 fails."""
        response_mock = Mock()
        # First two attempts raise, third one succeeds.
        mock_request.side_effect = (ValueError(), ValueError(), response_mock)
        response = http_backoff(  # retry on ValueError, instant retry
            "GET", URL, retry_on_exceptions=ValueError, base_wait_time=0.0
        )
        self.assertEqual(mock_request.call_count, 3)
        mock_request.assert_has_calls(
            calls=[
                call(method="GET", url=URL),
                call(method="GET", url=URL),
                call(method="GET", url=URL),
            ]
        )
        self.assertIs(response, response_mock)

    def test_backoff_on_exception_until_max(self, mock_request: Mock) -> None:
        """Test `http_backoff` until max limit is reached with exceptions."""
        mock_request.side_effect = ConnectTimeout()

        with self.assertRaises(ConnectTimeout):
            http_backoff("GET", URL, base_wait_time=0.0, max_retries=3)

        # 1 initial attempt + 3 retries.
        self.assertEqual(mock_request.call_count, 4)

    def test_backoff_on_status_code_until_max(self, mock_request: Mock) -> None:
        """Test `http_backoff` until max limit is reached with status codes."""
        mock_503 = Mock()
        mock_503.status_code = 503
        mock_504 = Mock()
        mock_504.status_code = 504
        # Only the last response's `raise_for_status` is expected to be called.
        mock_504.raise_for_status.side_effect = HTTPError()
        mock_request.side_effect = (mock_503, mock_504, mock_503, mock_504)

        with self.assertRaises(HTTPError):
            http_backoff(
                "GET",
                URL,
                base_wait_time=0.0,
                max_retries=3,
                retry_on_status_codes=(503, 504),
            )

        self.assertEqual(mock_request.call_count, 4)

    def test_backoff_on_exceptions_and_status_codes(self, mock_request: Mock) -> None:
        """Test `http_backoff` until max limit with status codes and exceptions."""
        # Retry budget is shared between status-code retries and exception retries.
        mock_503 = Mock()
        mock_503.status_code = 503
        mock_request.side_effect = (mock_503, ConnectTimeout())

        with self.assertRaises(ConnectTimeout):
            http_backoff("GET", URL, base_wait_time=0.0, max_retries=1)

        self.assertEqual(mock_request.call_count, 2)

    def test_backoff_on_valid_status_code(self, mock_request: Mock) -> None:
        """Test `http_backoff` until max limit with a valid status code.

        Quite a corner case: the user wants to retry if the status code is 200.
        Requests are retried but in the end, the HTTP 200 response is returned if the
        server returned only 200 responses.
        """
        mock_200 = Mock()
        mock_200.status_code = 200
        mock_request.side_effect = (mock_200, mock_200, mock_200, mock_200)

        response = http_backoff(
            "GET", URL, base_wait_time=0.0, max_retries=3, retry_on_status_codes=200
        )

        self.assertEqual(mock_request.call_count, 4)
        # `raise_for_status` on a 200 Mock does not raise, so the response is returned.
        self.assertIs(response, mock_200)

    def test_backoff_sleep_time(self, mock_request: Mock) -> None:
        """Test `http_backoff` sleep time goes exponential until max limit.

        Since timing between 2 requests is sleep duration + some other stuff, this test
        can be unstable. However, sleep durations between 10ms and 50ms should be enough
        to make the approximation that measured durations are the "sleep time" waited by
        `http_backoff`. If this is not the case, just increase `base_wait_time`,
        `max_wait_time` and `expected_sleep_times` with bigger values.
        """
        sleep_times = []

        def _side_effect_timer() -> Generator[ConnectTimeout, None, None]:
            # Measures elapsed wall-clock time between consecutive request attempts;
            # each attempt raises `ConnectTimeout` to force a retry.
            t0 = time.time()
            while True:
                yield ConnectTimeout()
                t1 = time.time()
                sleep_times.append(round(t1 - t0, 2))
                t0 = t1

        mock_request.side_effect = _side_effect_timer()

        with self.assertRaises(ConnectTimeout):
            http_backoff(
                "GET", URL, base_wait_time=0.01, max_wait_time=0.05, max_retries=5
            )

        self.assertEqual(mock_request.call_count, 6)

        # Assert sleep times are exponential until plateau
        expected_sleep_times = [0.01, 0.02, 0.04, 0.05, 0.05]
        self.assertListEqual(sleep_times, expected_sleep_times)
4 changes: 2 additions & 2 deletions tests/testing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,15 +128,15 @@ def offline(mode=OfflineSimulationMode.CONNECTION_FAILS, timeout=1e-16):
"""
Simulate offline mode.

There are three offline simulatiom modes:
There are three offline simulation modes:

CONNECTION_FAILS (default mode): a ConnectionError is raised for each network call.
Connection errors are created by mocking socket.socket
CONNECTION_TIMES_OUT: the connection hangs until it times out.
The default timeout value is low (1e-16) to speed up the tests.
Timeout errors are created by mocking requests.request
HF_HUB_OFFLINE_SET_TO_1: the HF_HUB_OFFLINE_SET_TO_1 environment variable is set to 1.
This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEmabled error.
This makes the http/ftp calls of the library instantly fail and raise an OfflineModeEnabled error.
"""
import socket

Expand Down