⚡️ Speed up method S3DataSource.upload_file by 10%
#634
📄 10% (0.10x) speedup for S3DataSource.upload_file in backend/python/app/sources/external/s3/s3.py
⏱️ Runtime: 1.19 milliseconds → 1.08 milliseconds (best of 225 runs)
📝 Explanation and details
The optimization achieves a 10% runtime improvement by eliminating an unnecessary function call in the success path of the upload_file method.

Key optimization: the redundant _handle_s3_response() call was removed. The original code called self._handle_s3_response(response) after every upload, but upload_file in aioboto3 returns None on success and raises exceptions on failure, so there is nothing to inspect. The optimized version directly returns S3Response(success=True, data=None) when no exception occurs.

Performance impact: the _handle_s3_response() call took 37.8% of total execution time in the original version (roughly 3.15 milliseconds out of 8.33 milliseconds of profiled time).

Throughput and test case performance: the optimization shows consistent improvements across all test scenarios, with the most benefit on the success path.

The optimization maintains identical error handling behavior while streamlining the success path, making it ideal for production workloads with frequent S3 uploads.
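For illustration, here is a minimal sketch of the optimized success path. The S3Response fields beyond success and data, the error-path handling, and the s3_client variable name are assumptions drawn from the description above, not the repository's exact code:

from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class S3Response:
    success: bool
    data: Optional[Any] = None
    error: Optional[str] = None

async def upload_file(s3_client, Filename: str, Bucket: str, Key: str, **kwargs) -> S3Response:
    try:
        # aioboto3's upload_file returns None on success and raises on failure,
        # so there is no response object worth post-processing.
        await s3_client.upload_file(Filename=Filename, Bucket=Bucket, Key=Key, **kwargs)
        return S3Response(success=True, data=None)  # fast path: no _handle_s3_response() call
    except Exception as e:  # ClientError and generic failures alike
        return S3Response(success=False, error=str(e))

The error path is unchanged; only the success path skips the extra call.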
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
import asyncio # used to run async functions
# Patch aioboto3 and ClientError for the S3DataSource import
import sys
import types
from typing import Any, Dict, Optional
import pytest # used for our unit tests
from app.sources.external.s3.s3 import S3DataSource
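# (Note: the sys.modules patching that installs stub aioboto3/botocore modules before this
# import appears to have been truncated from the report; sys and types are presumably
# imported for that purpose.)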
class DummyS3RESTClientViaAccessKey:
"""Stub for S3RESTClientViaAccessKey."""
def get_session(self):
return DummyAioboto3Session()
class DummyAioboto3Session:
"""Stub for aioboto3.Session."""
def client(self, service_name):
return DummyAsyncS3Client()
class DummyAsyncS3Client:
"""Stub for aioboto3 S3 client."""
async def __aenter__(self):
return self
class DummyClientError(Exception):
"""Stub for botocore.exceptions.ClientError."""
def __init__(self, response, operation_name):
self.response = response
self.operation_name = operation_name
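# NOTE: the rest of DummyAsyncS3Client (its __aexit__ and upload_file stubs) appears
# truncated in this report. Based on the filenames used by the tests below, a plausible
# reconstruction (hypothetical, not the original) would dispatch on Filename, e.g.:
#     async def __aexit__(self, exc_type, exc, tb):
#         return False
#     async def upload_file(self, Filename, Bucket, Key, **kwargs):
#         if Filename == "raise_clienterror":
#             raise DummyClientError({"Error": {"Code": "500", "Message": "fail"}}, "upload_file")
#         if Filename == "raise_exception":
#             raise RuntimeError("simulated failure")
#         return None  # aioboto3's upload_file returns None on success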
# --- End: Minimal stubs ---
# --- Begin: S3Client and S3DataSource code (EXACT COPY from prompt) ---
class S3Client:
"""Builder class for S3 clients with different construction methods using aioboto3"""
# --- End: S3Client and S3DataSource code ---
# --- Begin: Unit tests ---
@pytest.fixture
def s3_data_source():
# Create S3DataSource with dummy client for all tests
client = S3Client(DummyS3RESTClientViaAccessKey())
return S3DataSource(client)
# ------------------ BASIC TEST CASES ------------------
@pytest.mark.asyncio
async def test_upload_file_basic_success(s3_data_source):
"""Basic: Test successful upload with minimal required parameters."""
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="test-bucket",
Key="test-key"
)
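# NOTE: the assertions have been stripped from this report; in the generated tests each
# return value is captured as codeflash_output and compared between the original and
# optimized implementations. Assuming the S3Response shape described in the explanation
# above, a typical check would look like:
#     assert result.success is True
#     assert result.data is None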
@pytest.mark.asyncio
async def test_upload_file_basic_extra_args(s3_data_source):
"""Basic: Test upload with ExtraArgs parameter."""
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="test-bucket",
Key="public-key",
ExtraArgs={"ACL": "public-read"}
)
@pytest.mark.asyncio
async def test_upload_file_basic_callback(s3_data_source):
"""Basic: Test upload with Callback parameter."""
called = []
def cb(arg):
called.append(arg)
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="test-bucket",
Key="cb-key",
Callback=cb
)
@pytest.mark.asyncio
async def test_upload_file_basic_config(s3_data_source):
"""Basic: Test upload with Config parameter."""
class DummyConfig:
config_value = "dummy-config"
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="test-bucket",
Key="cfg-key",
Config=DummyConfig()
)
# ------------------ EDGE TEST CASES ------------------
@pytest.mark.asyncio
async def test_upload_file_missing_parameters(s3_data_source):
"""Edge: Test upload with missing required parameters."""
# Missing Filename
result = await s3_data_source.upload_file(
Filename="",
Bucket="test-bucket",
Key="test-key"
)
# Missing Bucket
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="",
Key="test-key"
)
# Missing Key
result = await s3_data_source.upload_file(
Filename="testfile.txt",
Bucket="test-bucket",
Key=""
)
@pytest.mark.asyncio
async def test_upload_file_client_error(s3_data_source):
"""Edge: Simulate ClientError exception from underlying client."""
result = await s3_data_source.upload_file(
Filename="raise_clienterror",
Bucket="test-bucket",
Key="test-key"
)
@pytest.mark.asyncio
async def test_upload_file_generic_exception(s3_data_source):
"""Edge: Simulate generic Exception from underlying client."""
result = await s3_data_source.upload_file(
Filename="raise_exception",
Bucket="test-bucket",
Key="test-key"
)
@pytest.mark.asyncio
async def test_upload_file_empty_file(s3_data_source):
"""Edge: Simulate uploading an empty file."""
result = await s3_data_source.upload_file(
Filename="empty_file",
Bucket="test-bucket",
Key="empty-key"
)
@pytest.mark.asyncio
async def test_upload_file_concurrent_execution(s3_data_source):
"""Edge: Test concurrent uploads with different parameters."""
async def upload_one(idx):
fname = f"file_{idx}.txt"
return await s3_data_source.upload_file(
Filename=fname,
Bucket="test-bucket",
Key=f"key_{idx}"
)
# Run 10 concurrent uploads
results = await asyncio.gather(*(upload_one(i) for i in range(10)))
for i, result in enumerate(results):
pass
# ------------------ LARGE SCALE TEST CASES ------------------
@pytest.mark.asyncio
async def test_upload_file_large_file(s3_data_source):
"""Large scale: Simulate uploading a large file."""
result = await s3_data_source.upload_file(
Filename="large_file",
Bucket="test-bucket",
Key="large-key"
)
@pytest.mark.asyncio
async def test_upload_file_many_concurrent(s3_data_source):
"""Large scale: Test many concurrent uploads (up to 50)."""
async def upload_one(idx):
fname = f"file_{idx}.txt"
return await s3_data_source.upload_file(
Filename=fname,
Bucket="test-bucket",
Key=f"key_{idx}"
)
# Run 50 concurrent uploads
results = await asyncio.gather(*(upload_one(i) for i in range(50)))
for i, result in enumerate(results):
pass
# ------------------ THROUGHPUT TEST CASES ------------------
@pytest.mark.asyncio
async def test_upload_file_throughput_small_load(s3_data_source):
"""Throughput: Test upload throughput with small load (10 files)."""
async def upload_one(idx):
return await s3_data_source.upload_file(
Filename=f"small_{idx}.txt",
Bucket="throughput-bucket",
Key=f"small_key_{idx}"
)
results = await asyncio.gather(*(upload_one(i) for i in range(10)))
@pytest.mark.asyncio
async def test_upload_file_throughput_medium_load(s3_data_source):
"""Throughput: Test upload throughput with medium load (50 files)."""
async def upload_one(idx):
return await s3_data_source.upload_file(
Filename=f"medium_{idx}.txt",
Bucket="throughput-bucket",
Key=f"medium_key_{idx}"
)
results = await asyncio.gather(*(upload_one(i) for i in range(50)))
@pytest.mark.asyncio
async def test_upload_file_throughput_large_load(s3_data_source):
"""Throughput: Test upload throughput with large load (100 files)."""
async def upload_one(idx):
return await s3_data_source.upload_file(
Filename=f"large_{idx}.txt",
Bucket="throughput-bucket",
Key=f"large_key_{idx}"
)
results = await asyncio.gather(*(upload_one(i) for i in range(100)))
@pytest.mark.asyncio
async def test_upload_file_throughput_mixed_load(s3_data_source):
"""Throughput: Test upload throughput with mixed valid and invalid files."""
async def upload_one(idx):
if idx % 10 == 0:
# Simulate missing parameter
return await s3_data_source.upload_file(
Filename="",
Bucket="throughput-bucket",
Key=f"mixed_key_{idx}"
)
else:
return await s3_data_source.upload_file(
Filename=f"mixed_{idx}.txt",
Bucket="throughput-bucket",
Key=f"mixed_key_{idx}"
)
results = await asyncio.gather(*(upload_one(i) for i in range(30)))
for i, result in enumerate(results):
if i % 10 == 0:
pass
else:
pass
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import asyncio # used to run async functions
# ---- Patch aioboto3 and ClientError in S3DataSource for testing ----
import sys
from typing import Any, Dict, Optional
import pytest # used for our unit tests
from app.sources.external.s3.s3 import S3DataSource
class DummyS3RESTClientViaAccessKey:
"""Dummy client to simulate aioboto3 session"""
def get_session(self):
return DummyAioboto3Session()
class S3Client:
def __init__(self, client: Any) -> None:
self.client = client
class DummyClientError(Exception):
def __init__(self, response):
self.response = response
class DummyAioboto3Session:
"""Simulates aioboto3.Session.client('s3') as an async context manager"""
def client(self, service_name):
return DummyAsyncS3Client()
class DummyAsyncS3Client:
"""Simulates aioboto3 S3 client with async upload_file method"""
async def __aenter__(self):
return self
# ---- Fixtures ----
@pytest.fixture
def s3_data_source():
"""Provides an S3DataSource instance with dummy S3Client."""
client = S3Client(DummyS3RESTClientViaAccessKey())
return S3DataSource(client)
# ---- Basic Test Cases ----
@pytest.mark.asyncio
async def test_upload_file_basic_success(s3_data_source):
"""Test basic successful upload."""
response = await s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key="my-key"
)
@pytest.mark.asyncio
async def test_upload_file_basic_error_response(s3_data_source):
"""Test upload_file returns error when S3 returns error dict."""
response = await s3_data_source.upload_file(
Filename="error.txt",
Bucket="bad-bucket",
Key="bad-key"
)
@pytest.mark.asyncio
async def test_upload_file_basic_empty_response(s3_data_source):
"""Test upload_file handles None response from S3."""
response = await s3_data_source.upload_file(
Filename="empty.txt",
Bucket="my-bucket",
Key="my-key"
)
@pytest.mark.asyncio
async def test_upload_file_basic_extra_args(s3_data_source):
"""Test upload_file with ExtraArgs parameter."""
response = await s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key="my-key",
ExtraArgs={"ACL": "public-read"}
)
@pytest.mark.asyncio
async def test_upload_file_basic_callback(s3_data_source):
"""Test upload_file with Callback parameter."""
response = await s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key="my-key",
Callback=lambda x: x
)
@pytest.mark.asyncio
async def test_upload_file_basic_config(s3_data_source):
"""Test upload_file with Config parameter."""
response = await s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key="my-key",
Config={"multipart_threshold": 8 * 1024 * 1024}
)
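# NOTE: with the real boto3/aioboto3 API, Config is a boto3.s3.transfer.TransferConfig
# (e.g. TransferConfig(multipart_threshold=8 * 1024 * 1024)) rather than a plain dict;
# the dummy client used here accepts any object, so the dict suffices for this test.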
# ---- Edge Test Cases ----
@pytest.mark.asyncio
async def test_upload_file_edge_client_error(s3_data_source):
"""Test upload_file raises and handles ClientError."""
response = await s3_data_source.upload_file(
Filename="raise_client_error.txt",
Bucket="my-bucket",
Key="my-key"
)
@pytest.mark.asyncio
async def test_upload_file_edge_generic_exception(s3_data_source):
"""Test upload_file raises and handles generic Exception."""
response = await s3_data_source.upload_file(
Filename="raise_exception.txt",
Bucket="my-bucket",
Key="my-key"
)
@pytest.mark.asyncio
async def test_upload_file_edge_concurrent_execution(s3_data_source):
"""Test concurrent execution of upload_file with different keys."""
filenames = ["valid.txt"] * 5
buckets = ["my-bucket"] * 5
keys = [f"key-{i}" for i in range(5)]
coros = [
s3_data_source.upload_file(
Filename=fn, Bucket=bucket, Key=key
) for fn, bucket, key in zip(filenames, buckets, keys)
]
responses = await asyncio.gather(*coros)
for i, response in enumerate(responses):
pass
@pytest.mark.asyncio
async def test_upload_file_edge_invalid_parameters(s3_data_source):
"""Test upload_file with missing optional parameters (should succeed)."""
response = await s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key="my-key"
)
# ---- Large Scale Test Cases ----
@pytest.mark.asyncio
async def test_upload_file_large_scale_concurrent_uploads(s3_data_source):
"""Test large scale concurrent uploads (up to 50)."""
num_uploads = 50
coros = [
s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key=f"bulk-key-{i}"
) for i in range(num_uploads)
]
responses = await asyncio.gather(*coros)
# All should succeed and have correct keys
for i, response in enumerate(responses):
pass
@pytest.mark.asyncio
async def test_upload_file_large_scale_mixed_results(s3_data_source):
"""Test large scale with mixed success/error responses."""
filenames = ["valid.txt", "error.txt", "raise_client_error.txt", "raise_exception.txt", "empty.txt"] * 10
coros = [
s3_data_source.upload_file(
Filename=fn,
Bucket="my-bucket",
Key=f"mixed-key-{i}"
) for i, fn in enumerate(filenames)
]
responses = await asyncio.gather(*coros)
# Check correct error/success for each type
for i, response in enumerate(responses):
fn = filenames[i]
if fn == "valid.txt":
pass
elif fn == "error.txt":
pass
elif fn == "raise_client_error.txt":
pass
elif fn == "raise_exception.txt":
pass
elif fn == "empty.txt":
pass
# ---- Throughput Test Cases ----
@pytest.mark.asyncio
async def test_upload_file_throughput_small_load(s3_data_source):
"""Throughput test: small load (10 concurrent uploads)."""
coros = [
s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key=f"throughput-small-{i}"
) for i in range(10)
]
responses = await asyncio.gather(*coros)
for i, r in enumerate(responses):
pass
@pytest.mark.asyncio
async def test_upload_file_throughput_medium_load(s3_data_source):
"""Throughput test: medium load (50 concurrent uploads)."""
coros = [
s3_data_source.upload_file(
Filename="valid.txt",
Bucket="my-bucket",
Key=f"throughput-medium-{i}"
) for i in range(50)
]
responses = await asyncio.gather(*coros)
for i, r in enumerate(responses):
pass
@pytest.mark.asyncio
async def test_upload_file_throughput_large_file(s3_data_source):
"""Throughput test: single large file upload."""
response = await s3_data_source.upload_file(
Filename="largefile.txt",
Bucket="my-bucket",
Key="large-key"
)
@pytest.mark.asyncio
async def test_upload_file_throughput_high_volume_mixed(s3_data_source):
"""Throughput test: high volume with mixed file types (100 uploads)."""
filenames = ["valid.txt", "error.txt", "largefile.txt"] * 33 + ["valid.txt"]
coros = [
s3_data_source.upload_file(
Filename=fn,
Bucket="my-bucket",
Key=f"throughput-high-{i}"
) for i, fn in enumerate(filenames)
]
responses = await asyncio.gather(*coros)
for i, response in enumerate(responses):
fn = filenames[i]
if fn == "valid.txt":
pass
elif fn == "error.txt":
pass
elif fn == "largefile.txt":
pass
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
To edit these changes, git checkout codeflash/optimize-S3DataSource.upload_file-mhxcx3pg and push.