Skip to content

Commit

Permalink
fix: retry for model and bento upload failure (#4961)
Browse files Browse the repository at this point in the history
Signed-off-by: Frost Ming <me@frostming.com>
  • Loading branch information
frostming authored Sep 6, 2024
1 parent 72f38ce commit 1e8afb3
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 18 deletions.
5 changes: 5 additions & 0 deletions src/bentoml/_internal/cloud/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ class CallbackIOWrapper(t.IO[bytes]):
end: int | None = None

# Post-init hook. The name follows the attrs ``__attrs_post_init__``
# convention, so it is presumably invoked right after the generated
# ``__init__`` -- confirm against the class decorator, which is outside this hunk.
def __attrs_post_init__(self) -> None:
# Delegate to reset(); the byte count that reset() returns is discarded here.
self.reset()

def reset(self) -> int:
    """Rewind the wrapped file to the start offset of this wrapper.

    Returns:
        The distance between the position reported by ``self.tell()``
        before rewinding and the start offset (``self.start``, or 0 when
        it is unset).
    """
    position = self.tell()
    origin = self.start or 0
    # Absolute seek (whence=0) back to the start offset.
    self.file.seek(origin, 0)
    return position - origin

def seek(self, offset: int, whence: int = 0) -> int:
if whence == 2 and self.end is not None:
Expand Down
55 changes: 37 additions & 18 deletions src/bentoml/_internal/cloud/bentocloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@
from .schemas.schemasv1 import ModelWithRepositoryListSchema


# Maximum number of PUT attempts made for a single upload chunk; when the
# last attempt still does not return HTTP 200, the upload is reported as FAILED.
UPLOAD_RETRY_COUNT = 3


class BentoCloudClient(CloudClient):
@inject
def push_bento(
Expand Down Expand Up @@ -319,16 +322,24 @@ def chunk_upload(
else None,
)

resp = httpx.put(
remote_bento.presigned_upload_url,
content=chunk_io,
timeout=36000,
)
if resp.status_code != 200:
return FinishUploadBentoSchema(
status=BentoUploadStatus.FAILED.value,
reason=resp.text,
for i in range(UPLOAD_RETRY_COUNT):
resp = httpx.put(
remote_bento.presigned_upload_url,
content=chunk_io,
timeout=36000,
)
if resp.status_code == 200:
break
if i == UPLOAD_RETRY_COUNT - 1:
return FinishUploadBentoSchema(
status=BentoUploadStatus.FAILED.value,
reason=resp.text,
)
else: # retry and reset and update progress
read = chunk_io.reset()
self.spinner.transmission_progress.update(
upload_task_id, advance=-read
)
return resp.headers["ETag"], chunk_number

futures_: list[
Expand Down Expand Up @@ -771,16 +782,24 @@ def chunk_upload(
else None,
)

resp = httpx.put(
remote_model.presigned_upload_url,
content=chunk_io,
timeout=36000,
)
if resp.status_code != 200:
return FinishUploadModelSchema(
status=ModelUploadStatus.FAILED.value,
reason=resp.text,
for i in range(UPLOAD_RETRY_COUNT):
resp = httpx.put(
remote_model.presigned_upload_url,
content=chunk_io,
timeout=36000,
)
if resp.status_code == 200:
break
if i == UPLOAD_RETRY_COUNT - 1:
return FinishUploadModelSchema(
status=ModelUploadStatus.FAILED.value,
reason=resp.text,
)
else: # retry and reset and update progress
read = chunk_io.reset()
self.spinner.transmission_progress.update(
upload_task_id, advance=-read
)
return resp.headers["ETag"], chunk_number

futures_: list[
Expand Down

0 comments on commit 1e8afb3

Please sign in to comment.