Skip to content

Commit

Permalink
feat(client): add uploads.upload_file helper
Browse files Browse the repository at this point in the history
  • Loading branch information
RobertCraigie committed Aug 16, 2024
1 parent b143c16 commit aae079d
Show file tree
Hide file tree
Showing 3 changed files with 279 additions and 2 deletions.
10 changes: 10 additions & 0 deletions examples/generate_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Generate a text file filled with random base64 data for testing file uploads.
#
# The result is written to /tmp/big_test_file.txt, the path that
# examples/uploads.py reads.
set -euo pipefail

out_file=/tmp/big_test_file.txt

# Target size of the final file, in bytes.
wanted_size=$((1024*2048*512))
# base64 emits 4 output characters for every 3 input bytes; round the encoded
# size up to a multiple of 12 so the raw byte count below is a whole number.
file_size=$(( ((wanted_size/12)+1)*12 ))
# Number of raw random bytes that encode to `file_size` base64 characters.
read_size=$((file_size*3/4))

echo "wanted=$wanted_size file=$file_size read=$read_size"

# `head -c` keeps reading until it has the full byte count, whereas a single
# huge `dd bs=... count=1` read from /dev/urandom can come back short.
head -c "$read_size" /dev/urandom | base64 > "$out_file"

# Trim the encoded output (slightly oversized, plus any line breaks that
# base64 inserted) down to exactly the wanted size.
truncate -s "$wanted_size" "$out_file"
46 changes: 46 additions & 0 deletions examples/uploads.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import sys
from pathlib import Path

import rich

from openai import OpenAI

# Path of the file that both examples below upload.
# generate this file using `./generate_file.sh`
file = Path("/tmp/big_test_file.txt")

# Client shared by both examples; constructed with no explicit arguments
# (presumably it picks up credentials from the environment — confirm against
# the OpenAI client documentation).
client = OpenAI()


def from_disk() -> None:
    """Upload the test file directly from its path on disk and print the result."""
    print("uploading file from disk")

    # Only the path is handed over; no filename or byte count is passed here.
    uploads = client.uploads
    result = uploads.upload_file_chunked(purpose="batch", mime_type="txt", file=file)
    rich.print(result)


def from_in_memory() -> None:
    """Upload the test file from an in-memory ``bytes`` object and print the result."""
    print("uploading file from memory")

    # Load the whole file up front to stand in for data that arrived from
    # somewhere other than the local filesystem.
    payload = file.read_bytes()

    # The name and size are passed explicitly alongside the raw bytes.
    result = client.uploads.upload_file_chunked(
        file=payload,
        filename="my_file.txt",
        bytes=len(payload),
        mime_type="txt",
        purpose="batch",
    )
    rich.print(result)


# Select the example from the command line: the in-memory variant runs only
# when `memory` appears among the arguments; the on-disk variant is the default.
if "memory" not in sys.argv:
    from_disk()
else:
    from_in_memory()
225 changes: 223 additions & 2 deletions src/openai/resources/uploads/uploads.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,14 @@

from __future__ import annotations

from typing import List

import io
import os
import logging
import builtins
from typing import List, overload
from pathlib import Path

import anyio
import httpx

from ... import _legacy_response
Expand Down Expand Up @@ -31,6 +37,12 @@
__all__ = ["Uploads", "AsyncUploads"]


# Default size, in bytes, of each part that `upload_file_chunked` reads and
# uploads when the caller does not pass `part_size`.
# 64MB
DEFAULT_PART_SIZE = 64 * 1024 * 1024

# Module-level logger; `upload_file_chunked` logs each uploaded part through it.
log: logging.Logger = logging.getLogger(__name__)


class Uploads(SyncAPIResource):
@cached_property
def parts(self) -> Parts:
Expand All @@ -44,6 +56,105 @@ def with_raw_response(self) -> UploadsWithRawResponse:
def with_streaming_response(self) -> UploadsWithStreamingResponse:
return UploadsWithStreamingResponse(self)

@overload
def upload_file_chunked(
    self,
    *,
    file: os.PathLike[str],
    mime_type: str,
    purpose: FilePurpose,
    bytes: int | None = None,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits a file on disk into multiple parts (64MB each by default) and uploads them sequentially."""

@overload
def upload_file_chunked(
    self,
    *,
    file: bytes,
    filename: str,
    bytes: int,
    mime_type: str,
    purpose: FilePurpose,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits an in-memory file into multiple parts (64MB each by default) and uploads them sequentially."""

def upload_file_chunked(
    self,
    *,
    file: os.PathLike[str] | bytes,
    mime_type: str,
    purpose: FilePurpose,
    filename: str | None = None,
    bytes: int | None = None,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits the given file into multiple parts and uploads them sequentially.

    Creates an upload, sends the file contents to it one part at a time and
    then completes the upload with the collected part IDs.

    ```py
    from pathlib import Path

    client.uploads.upload_file_chunked(
        file=Path("my-paper.pdf"),
        mime_type="pdf",
        purpose="assistants",
    )
    ```

    Args:
      file: Either a path to a file on disk or the raw file contents as `bytes`.

      mime_type: Passed through to `create()`.

      purpose: Passed through to `create()`.

      filename: Name to register for the upload. Required when `file` is `bytes`;
          for a path it falls back to the final path component when not given
          (or empty).

      bytes: Total size of the file in bytes. Required when `file` is `bytes`;
          for a path it falls back to the size reported by `stat()`.

      part_size: Maximum number of bytes read and uploaded per part. Defaults to
          `DEFAULT_PART_SIZE` (64MB).

      md5: Passed through to `complete()`.

    Returns:
      The `Upload` returned by `complete()`.

    Raises:
      TypeError: If `file` is `bytes` and `filename` or `bytes` is missing.
    """
    # The `bytes` parameter shadows the builtin type inside this function,
    # so the type is referenced as `builtins.bytes`.
    if isinstance(file, builtins.bytes):
        if filename is None:
            raise TypeError("The `filename` argument must be given for in-memory files")

        if bytes is None:
            raise TypeError("The `bytes` argument must be given for in-memory files")
    else:
        if not isinstance(file, Path):
            file = Path(file)

        if not filename:
            filename = file.name

        if bytes is None:
            bytes = file.stat().st_size

    upload = self.create(
        bytes=bytes,
        filename=filename,
        mime_type=mime_type,
        purpose=purpose,
    )

    part_ids: list[str] = []

    if part_size is None:
        part_size = DEFAULT_PART_SIZE

    if isinstance(file, builtins.bytes):
        buf: io.FileIO | io.BytesIO = io.BytesIO(file)
    else:
        buf = io.FileIO(file)

    # `with` closes the buffer on every exit path: normal completion, any
    # exception raised while uploading a part, and interrupts.
    with buf:
        while True:
            data = buf.read(part_size)
            if not data:
                # EOF
                break

            part = self.parts.create(upload_id=upload.id, data=data)
            log.info("Uploaded part %s for upload %s", part.id, upload.id)
            part_ids.append(part.id)

    return self.complete(upload_id=upload.id, part_ids=part_ids, md5=md5)

def create(
self,
*,
Expand Down Expand Up @@ -227,6 +338,116 @@ def with_raw_response(self) -> AsyncUploadsWithRawResponse:
def with_streaming_response(self) -> AsyncUploadsWithStreamingResponse:
return AsyncUploadsWithStreamingResponse(self)

@overload
async def upload_file_chunked(
    self,
    *,
    file: os.PathLike[str],
    mime_type: str,
    purpose: FilePurpose,
    bytes: int | None = None,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits a file on disk into multiple parts (64MB each by default) and uploads them sequentially."""

@overload
async def upload_file_chunked(
    self,
    *,
    file: bytes,
    filename: str,
    bytes: int,
    mime_type: str,
    purpose: FilePurpose,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits an in-memory file into multiple parts (64MB each by default) and uploads them sequentially."""

async def upload_file_chunked(
    self,
    *,
    file: os.PathLike[str] | bytes,
    mime_type: str,
    purpose: FilePurpose,
    filename: str | None = None,
    bytes: int | None = None,
    part_size: int | None = None,
    md5: str | NotGiven = NOT_GIVEN,
) -> Upload:
    """Splits the given file into multiple parts and uploads them sequentially.

    Creates an upload, sends the file contents to it one part at a time and
    then completes the upload with the collected part IDs.

    ```py
    from pathlib import Path

    await client.uploads.upload_file_chunked(
        file=Path("my-paper.pdf"),
        mime_type="pdf",
        purpose="assistants",
    )
    ```

    Args:
      file: Either a path to a file on disk or the raw file contents as `bytes`.

      mime_type: Passed through to `create()`.

      purpose: Passed through to `create()`.

      filename: Name to register for the upload. Required when `file` is `bytes`;
          for a path it falls back to the final path component when not given
          (or empty).

      bytes: Total size of the file in bytes. Required when `file` is `bytes`;
          for a path it falls back to the size reported by `stat()`.

      part_size: Maximum number of bytes read and uploaded per part. Defaults to
          `DEFAULT_PART_SIZE` (64MB).

      md5: Passed through to `complete()`.

    Returns:
      The `Upload` returned by `complete()`.

    Raises:
      TypeError: If `file` is `bytes` and `filename` or `bytes` is missing.
    """
    # The `bytes` parameter shadows the builtin type inside this function,
    # so the type is referenced as `builtins.bytes`.
    if isinstance(file, builtins.bytes):
        if filename is None:
            raise TypeError("The `filename` argument must be given for in-memory files")

        if bytes is None:
            raise TypeError("The `bytes` argument must be given for in-memory files")
    else:
        if not isinstance(file, anyio.Path):
            file = anyio.Path(file)

        if not filename:
            filename = file.name

        if bytes is None:
            stat = await file.stat()
            bytes = stat.st_size

    upload = await self.create(
        bytes=bytes,
        filename=filename,
        mime_type=mime_type,
        purpose=purpose,
    )

    part_ids: list[str] = []

    if part_size is None:
        part_size = DEFAULT_PART_SIZE

    if isinstance(file, anyio.Path):
        # On-disk file: read through anyio's async file wrapper; `async with`
        # closes it on every exit path.
        fd = await file.open("rb")
        async with fd:
            while True:
                data = await fd.read(part_size)
                if not data:
                    # EOF
                    break

                part = await self.parts.create(upload_id=upload.id, data=data)
                log.info("Uploaded part %s for upload %s", part.id, upload.id)
                part_ids.append(part.id)
    else:
        # In-memory file: reads are synchronous; `with` closes the buffer on
        # success as well as on failure.
        with io.BytesIO(file) as buf:
            while True:
                data = buf.read(part_size)
                if not data:
                    # EOF
                    break

                part = await self.parts.create(upload_id=upload.id, data=data)
                log.info("Uploaded part %s for upload %s", part.id, upload.id)
                part_ids.append(part.id)

    return await self.complete(upload_id=upload.id, part_ids=part_ids, md5=md5)

async def create(
self,
*,
Expand Down

0 comments on commit aae079d

Please sign in to comment.