Skip to content

Commit 27046e7

Browse files
authored
Merge pull request #18 from simple-repository/feature/http_head_file_size
Extract the ability to get file size via http HEAD request into its own component
2 parents 0b498b7 + 2771ff0 commit 27046e7

File tree

4 files changed

+248
-29
lines changed

4 files changed

+248
-29
lines changed

simple_repository_browser/_app.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from simple_repository.components.local import LocalRepository
1515

1616
from . import controller, crawler, errors, fetch_projects, model, view
17+
from .filesize_enrichment import FileSizeEnrichmentRepository
1718
from .metadata_injector import MetadataInjector
1819
from .static_files import generate_manifest
1920

@@ -141,9 +142,11 @@ def _repo_from_url(
141142
def create_model(
142143
self, http_client: httpx.AsyncClient, database: aiosqlite.Connection
143144
) -> model.Model:
144-
source = MetadataInjector(
145-
self._repo_from_url(self.repository_url, http_client=http_client),
145+
base_repo = self._repo_from_url(self.repository_url, http_client=http_client)
146+
source = FileSizeEnrichmentRepository(
147+
MetadataInjector(base_repo, http_client=http_client),
146148
http_client=http_client,
149+
max_concurrent_requests=10,
147150
)
148151
return model.Model(
149152
source=source,

simple_repository_browser/fetch_description.py

Lines changed: 2 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import asyncio
21
import dataclasses
32
import datetime
43
import email.parser
@@ -144,35 +143,11 @@ async def package_info(
144143

145144
files_info: typing.Dict[str, FileInfo] = {}
146145

147-
# Get the size from the repository, if possible.
146+
# Get the size from the repository files
148147
for file in files:
149148
if file.size:
150149
files_info[file.filename] = FileInfo(
151-
size=file.size,
152-
)
153-
154-
limited_concurrency = asyncio.Semaphore(10)
155-
# Compute the size of each file.
156-
# TODO: This should be done as part of the repository component interface.
157-
async with httpx.AsyncClient(verify=False) as http_client:
158-
159-
async def semaphored_head(filename: str, url: str):
160-
async with limited_concurrency:
161-
headers: dict[str, str] = {}
162-
return (
163-
filename,
164-
await http_client.head(url, follow_redirects=True, headers=headers),
165-
)
166-
167-
coros = [
168-
semaphored_head(file.filename, file.url)
169-
for file in files
170-
if file.filename not in files_info
171-
]
172-
for coro in asyncio.as_completed(coros):
173-
filename, response = await coro
174-
files_info[filename] = FileInfo(
175-
size=int(response.headers["Content-Length"]),
150+
size=file.size or 0,
176151
)
177152

178153
file = files[0]
simple_repository_browser/filesize_enrichment.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
"""
2+
FileSizeEnrichmentRepository component for adding file size information to project pages.
3+
4+
This component wraps another repository and automatically enriches file metadata
5+
with size information by making HTTP HEAD requests to files that don't already
6+
have size information.
7+
"""
8+
9+
import asyncio
10+
from dataclasses import replace
11+
import logging
12+
import typing
13+
14+
import httpx
15+
from simple_repository import SimpleRepository, model
16+
from simple_repository.components.core import RepositoryContainer
17+
18+
from ._typing_compat import override
19+
20+
logger = logging.getLogger(__name__)
21+
22+
23+
class FileSizeEnrichmentRepository(RepositoryContainer):
    """
    Repository component that enriches file metadata with size information.

    Files on a project page that have no (or a zero) ``size`` and do have a
    URL get their size filled in from the ``Content-Length`` header of an
    HTTP HEAD request. Requests run in parallel, bounded by a semaphore
    shared by every call made through this instance.
    """

    def __init__(
        self,
        source: SimpleRepository,
        http_client: httpx.AsyncClient,
        *,
        max_concurrent_requests: int = 10,
    ) -> None:
        """
        Initialize the FileSizeEnrichmentRepository.

        Parameters
        ----------
        source: The underlying repository to wrap

        http_client: HTTP client for making HEAD requests

        max_concurrent_requests: Maximum number of concurrent HEAD requests
        """
        super().__init__(source)
        self.http_client = http_client
        # One semaphore per instance, so the limit applies across all
        # project pages being enriched concurrently through this component.
        self.semaphore = asyncio.Semaphore(max_concurrent_requests)

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: model.RequestContext = model.RequestContext.DEFAULT,
    ) -> model.ProjectDetail:
        """
        Get project page with file sizes enriched.

        Files that don't have size information will have their sizes fetched
        via HTTP HEAD requests in parallel. Files whose size cannot be
        determined are passed through unchanged.

        Parameters
        ----------
        project_name: Name of the project to look up in the wrapped source

        request_context: Forwarded unchanged to the wrapped source

        Returns
        -------
        The project page from the wrapped source, with ``size`` filled in on
        every file for which a size could be fetched.
        """
        project_page = await super().get_project_page(
            project_name, request_context=request_context
        )

        # Only files lacking a size, and which we can actually reach, are
        # worth a HEAD request.
        files_needing_size = [
            file for file in project_page.files if not file.size and file.url
        ]

        if not files_needing_size:
            # Nothing to enrich, return the page as-is.
            return project_page

        size_info = await self._fetch_file_sizes(files_needing_size)

        # The model objects are replaced rather than mutated in place.
        enriched_files = tuple(
            replace(file, size=size_info[file.filename])
            if file.filename in size_info
            else file
            for file in project_page.files
        )

        return replace(project_page, files=enriched_files)

    async def _fetch_file_sizes(
        self, files: typing.List[model.File]
    ) -> typing.Dict[str, int]:
        """
        Fetch file sizes for multiple files in parallel.

        This is best-effort: a file whose HEAD request fails, or whose
        response has no usable ``Content-Length`` header, is logged and
        omitted from the result rather than raising.

        Parameters
        ----------
        files: List of files to fetch sizes for

        Returns
        -------
        Dictionary mapping filename to size in bytes, containing only the
        files whose size was successfully determined.
        """

        async def fetch_single_file_size(
            file: model.File,
        ) -> typing.Tuple[str, typing.Optional[int]]:
            """Fetch size for a single file with semaphore protection."""
            async with self.semaphore:
                try:
                    logger.debug(
                        "Fetching size for %s from %s", file.filename, file.url
                    )

                    # Make HEAD request to get Content-Length
                    response = await self.http_client.head(
                        file.url, follow_redirects=True, headers={}
                    )
                    response.raise_for_status()

                    content_length = response.headers.get("Content-Length")
                    if content_length:
                        return file.filename, int(content_length)

                    logger.warning(
                        "No Content-Length header for %s", file.filename
                    )
                    return file.filename, None

                # Deliberately ``Exception`` and not ``BaseException``:
                # asyncio.CancelledError, KeyboardInterrupt and SystemExit
                # must propagate so that cancellation and shutdown are not
                # silently turned into a "no size" result.
                except Exception as e:
                    logger.warning(
                        "Failed to get size for %s: %s", file.filename, e
                    )
                    return file.filename, None

        # Schedule one fetch per file; the semaphore bounds how many are in
        # flight at once.
        results = await asyncio.gather(
            *(fetch_single_file_size(file) for file in files),
            return_exceptions=True,
        )

        # Process results, filtering out failures
        size_info: typing.Dict[str, int] = {}
        for result in results:
            if isinstance(result, BaseException):
                # With return_exceptions=True anything that escaped the
                # handler above (e.g. a cancelled sub-task) is returned as a
                # value instead of being raised.
                logger.warning(
                    "Exception occurred during size fetching: %s", result
                )
                continue

            filename, size = result
            if size is not None:
                size_info[filename] = size

        return size_info
Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
import typing
2+
from unittest.mock import AsyncMock, MagicMock
3+
4+
import pytest
5+
from simple_repository import SimpleRepository, model
6+
import simple_repository.errors
7+
8+
from .._typing_compat import override
9+
from ..filesize_enrichment import FileSizeEnrichmentRepository
10+
11+
12+
class FakeRepository(SimpleRepository):
    """In-memory repository stub serving pre-registered project pages."""

    def __init__(self) -> None:
        # Tests populate this mapping directly: project name -> page.
        self.project_pages: dict[str, model.ProjectDetail] = {}

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.ProjectDetail:
        """
        Return the registered page for ``project_name``.

        Raises ``PackageNotFoundError`` when no page has been registered
        under that name. ``request_context`` is accepted but not used.
        """
        try:
            page = self.project_pages[project_name]
        except KeyError:
            raise simple_repository.errors.PackageNotFoundError(project_name)
        else:
            return page
27+
28+
29+
@pytest.mark.asyncio
async def test_successful_size_enrichment() -> None:
    """Test successful enrichment of file sizes."""
    # Single source of truth: filename -> size the fake server reports.
    # Insertion order is the order of the files on the project page.
    sizes_by_filename = {
        "test-1.0.whl": 1024,
        "test-1.0.tar.gz": 2048,
        "test-1.1.tar.gz": 3072,
        "test-1.2.tar.gz": 4096,
        "test-1.3.tar.gz": 5120,
        "test-1.4.tar.gz": 6144,
        "test-1.5.tar.gz": 7168,
    }

    project_page = model.ProjectDetail(
        meta=model.Meta("1.0"),
        name="test-project",
        files=tuple(
            model.File(filename, f"http://example.com/{filename}", {})
            for filename in sizes_by_filename
        ),
    )
    fake_repository = FakeRepository()
    fake_repository.project_pages["test-project"] = project_page

    # Create mock HTTP client that returns Content-Length headers
    mock_http_client = MagicMock()

    async def mock_head(url: str, **kwargs):
        """Mock HEAD request that returns Content-Length based on filename."""
        response = MagicMock()
        response.raise_for_status.return_value = None

        # The last URL path segment is the filename; unknown files fall back
        # to a fixed default size.
        filename = url.rsplit("/", 1)[-1]
        response.headers = {
            "Content-Length": str(sizes_by_filename.get(filename, 1000))
        }
        return response

    mock_http_client.head = AsyncMock(side_effect=mock_head)

    # Create enrichment repository; the concurrency limit is lower than the
    # number of files so that requests have to queue on the semaphore.
    enrichment_repo = FileSizeEnrichmentRepository(
        source=fake_repository,
        http_client=mock_http_client,
        max_concurrent_requests=3,
    )

    # Test that sizes are enriched
    result = await enrichment_repo.get_project_page("test-project")

    # Compare whole lists so that a missing, extra or re-ordered file fails
    # the test, which a per-index loop over result.files would not detect.
    assert [file.filename for file in result.files] == list(sizes_by_filename)
    assert [file.size for file in result.files] == list(sizes_by_filename.values())

    # Verify that HEAD requests were made for all files
    assert mock_http_client.head.call_count == len(sizes_by_filename)

0 commit comments

Comments
 (0)