Skip to content

Commit 47663ec

Browse files
committed
Extract the ability to get file size via an HTTP HEAD request into its own component
1 parent 0b498b7 commit 47663ec

File tree

4 files changed

+245
-29
lines changed

4 files changed

+245
-29
lines changed

simple_repository_browser/_app.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from simple_repository.components.local import LocalRepository
1515

1616
from . import controller, crawler, errors, fetch_projects, model, view
17+
from .filesize_enrichment import FileSizeEnrichmentRepository
1718
from .metadata_injector import MetadataInjector
1819
from .static_files import generate_manifest
1920

@@ -141,9 +142,11 @@ def _repo_from_url(
141142
def create_model(
142143
self, http_client: httpx.AsyncClient, database: aiosqlite.Connection
143144
) -> model.Model:
144-
source = MetadataInjector(
145-
self._repo_from_url(self.repository_url, http_client=http_client),
145+
base_repo = self._repo_from_url(self.repository_url, http_client=http_client)
146+
source = FileSizeEnrichmentRepository(
147+
MetadataInjector(base_repo, http_client=http_client),
146148
http_client=http_client,
149+
max_concurrent_requests=10,
147150
)
148151
return model.Model(
149152
source=source,

simple_repository_browser/fetch_description.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import asyncio
21
import dataclasses
32
import datetime
43
import email.parser
@@ -144,35 +143,11 @@ async def package_info(
144143

145144
files_info: typing.Dict[str, FileInfo] = {}
146145

147-
# Get the size from the repository, if possible.
146+
# Get the size from the repository files
148147
for file in files:
149148
if file.size:
150149
files_info[file.filename] = FileInfo(
151-
size=file.size,
152-
)
153-
154-
limited_concurrency = asyncio.Semaphore(10)
155-
# Compute the size of each file.
156-
# TODO: This should be done as part of the repository component interface.
157-
async with httpx.AsyncClient(verify=False) as http_client:
158-
159-
async def semaphored_head(filename: str, url: str):
160-
async with limited_concurrency:
161-
headers: dict[str, str] = {}
162-
return (
163-
filename,
164-
await http_client.head(url, follow_redirects=True, headers=headers),
165-
)
166-
167-
coros = [
168-
semaphored_head(file.filename, file.url)
169-
for file in files
170-
if file.filename not in files_info
171-
]
172-
for coro in asyncio.as_completed(coros):
173-
filename, response = await coro
174-
files_info[filename] = FileInfo(
175-
size=int(response.headers["Content-Length"]),
150+
size=file.size or 0,
176151
)
177152

178153
file = files[0]
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
"""
2+
FileSizeEnrichmentRepository component for adding file size information to project pages.
3+
4+
This component wraps another repository and automatically enriches file metadata
5+
with size information by making HTTP HEAD requests to files that don't already
6+
have size information.
7+
"""
8+
9+
import asyncio
10+
from dataclasses import replace
11+
import logging
12+
import typing
13+
14+
import httpx
15+
from simple_repository import SimpleRepository, model
16+
from simple_repository.components.core import RepositoryContainer
17+
18+
from ._typing_compat import override
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
class FileSizeEnrichmentRepository(RepositoryContainer):
    """
    Repository component that enriches file metadata with size information.

    For every file on a project page that has no (or a zero) ``size`` but does
    have a URL, an HTTP HEAD request is made and the ``Content-Length``
    response header is used as the size. Requests run in parallel, bounded by
    a semaphore that is shared by all calls made through this instance.

    Enrichment is best-effort: a file whose size cannot be determined is
    returned unchanged rather than failing the whole page.
    """

    def __init__(
        self,
        source: SimpleRepository,
        http_client: httpx.AsyncClient,
        *,
        max_concurrent_requests: int = 10,
    ) -> None:
        """
        Initialize the FileSizeEnrichmentRepository.

        Args:
            source: The underlying repository to wrap
            http_client: HTTP client for making HEAD requests
            max_concurrent_requests: Maximum number of concurrent HEAD requests
        """
        super().__init__(source)
        self.http_client = http_client
        self.semaphore = asyncio.Semaphore(max_concurrent_requests)

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.ProjectDetail:
        """
        Get project page with file sizes enriched.

        Files that don't have size information will have their sizes fetched
        via HTTP HEAD requests in parallel.

        Args:
            project_name: Name of the project to look up in the wrapped source
            request_context: Passed through unchanged to the wrapped source

        Returns:
            The project page from the wrapped source; files for which a size
            could be fetched are replaced by copies carrying that size.
        """
        project_page = await super().get_project_page(
            project_name, request_context=request_context
        )

        # Only files without a size but with a URL can be enriched.
        files_needing_size = [
            file for file in project_page.files if not file.size and file.url
        ]
        if not files_needing_size:
            # No files need size information, return as-is
            return project_page

        size_info = await self._fetch_file_sizes(files_needing_size)

        # Build new file objects (via dataclasses.replace) rather than
        # mutating the ones handed to us by the wrapped source.
        enriched_files = tuple(
            replace(file, size=size_info[file.filename])
            if file.filename in size_info
            else file
            for file in project_page.files
        )
        return replace(project_page, files=enriched_files)

    async def _fetch_single_file_size(
        self, file: model.File
    ) -> typing.Tuple[str, typing.Optional[int]]:
        """
        Fetch the size of a single file while holding the concurrency semaphore.

        Returns:
            ``(filename, size)`` where ``size`` is ``None`` if the request
            failed or the response had no usable ``Content-Length`` header.
        """
        async with self.semaphore:
            try:
                logger.debug("Fetching size for %s from %s", file.filename, file.url)

                # Make HEAD request to get Content-Length
                response = await self.http_client.head(
                    file.url, follow_redirects=True, headers={}
                )
                response.raise_for_status()

                content_length = response.headers.get("Content-Length")
                if content_length:
                    return file.filename, int(content_length)

                logger.warning("No Content-Length header for %s", file.filename)
                return file.filename, None
            except Exception as e:
                # Deliberately broad: size lookup is best-effort and a failure
                # for one file must not prevent the page from being served.
                logger.warning("Failed to get size for %s: %s", file.filename, e)
                return file.filename, None

    async def _fetch_file_sizes(
        self, files: typing.List[model.File]
    ) -> typing.Dict[str, int]:
        """
        Fetch file sizes for multiple files in parallel.

        Args:
            files: List of files to fetch sizes for

        Returns:
            Dictionary mapping filename to size in bytes. Files whose size
            could not be determined are omitted.
        """
        results = await asyncio.gather(
            *(self._fetch_single_file_size(file) for file in files),
            return_exceptions=True,
        )

        size_info: typing.Dict[str, int] = {}
        for result in results:
            # With return_exceptions=True a cancelled child is reported as a
            # CancelledError, which derives from BaseException (not Exception),
            # so the check must be against BaseException before unpacking.
            if isinstance(result, BaseException):
                logger.warning("Exception occurred during size fetching: %s", result)
                continue

            filename, size = result
            if size is not None:
                size_info[filename] = size

        return size_info
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import typing
2+
from unittest.mock import AsyncMock, MagicMock
3+
4+
import pytest
5+
from simple_repository import SimpleRepository, model
6+
import simple_repository.errors
7+
8+
from .._typing_compat import override
9+
from ..filesize_enrichment import FileSizeEnrichmentRepository
10+
11+
12+
class FakeRepository(SimpleRepository):
    """In-memory repository stub that serves pre-registered project pages."""

    def __init__(self) -> None:
        # Project name -> page returned by get_project_page; tests fill this in.
        self.project_pages: dict[str, model.ProjectDetail] = {}

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.ProjectDetail:
        """
        Return the registered page for ``project_name``.

        Raises:
            simple_repository.errors.PackageNotFoundError: if no page has been
                registered under ``project_name``.
        """
        try:
            return self.project_pages[project_name]
        except KeyError as err:
            # Chain explicitly so the traceback shows the lookup failure as the
            # cause rather than as an error raised while handling another one.
            raise simple_repository.errors.PackageNotFoundError(project_name) from err
27+
28+
29+
@pytest.mark.asyncio
async def test_successful_size_enrichment() -> None:
    """Test successful enrichment of file sizes."""
    # Filename -> size the mocked server reports for it. Insertion order is
    # also the order of the files on the project page.
    sizes_by_filename = {
        "test-1.0.whl": 1024,
        "test-1.0.tar.gz": 2048,
        "test-1.1.tar.gz": 3072,
        "test-1.2.tar.gz": 4096,
        "test-1.3.tar.gz": 5120,
        "test-1.4.tar.gz": 6144,
        "test-1.5.tar.gz": 7168,
    }

    project_page = model.ProjectDetail(
        meta=model.Meta("1.0"),
        name="test-project",
        files=tuple(
            model.File(filename, f"http://example.com/{filename}", {})
            for filename in sizes_by_filename
        ),
    )
    fake_repository = FakeRepository()
    fake_repository.project_pages["test-project"] = project_page

    # Create mock HTTP client that returns Content-Length headers
    mock_http_client = MagicMock()

    async def mock_head(url: str, **kwargs: typing.Any) -> MagicMock:
        """Mock HEAD request that returns Content-Length based on filename."""
        response = MagicMock()
        response.raise_for_status.return_value = None

        # The last URL path segment is the filename; unknown files get 1000.
        filename = url.rsplit("/", 1)[-1]
        size = sizes_by_filename.get(filename, 1000)
        response.headers = {"Content-Length": str(size)}
        return response

    mock_http_client.head = AsyncMock(side_effect=mock_head)

    # Create enrichment repository
    enrichment_repo = FileSizeEnrichmentRepository(
        source=fake_repository,
        http_client=mock_http_client,
        max_concurrent_requests=3,
    )

    # Test that sizes are enriched
    result = await enrichment_repo.get_project_page("test-project")

    # Compare whole lists so that a missing, extra or reordered file fails the
    # test instead of being silently skipped by a per-index loop.
    assert [file.filename for file in result.files] == list(sizes_by_filename)
    assert [file.size for file in result.files] == list(sizes_by_filename.values())

    # Verify that HEAD requests were made for all files
    assert mock_http_client.head.call_count == len(sizes_by_filename)

0 commit comments

Comments
 (0)