Skip to content

Extract the ability to get file size via http HEAD request into its own component #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions simple_repository_browser/_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from simple_repository.components.local import LocalRepository

from . import controller, crawler, errors, fetch_projects, model, view
from .filesize_enrichment import FileSizeEnrichmentRepository
from .metadata_injector import MetadataInjector
from .static_files import generate_manifest

Expand Down Expand Up @@ -141,9 +142,11 @@ def _repo_from_url(
def create_model(
self, http_client: httpx.AsyncClient, database: aiosqlite.Connection
) -> model.Model:
source = MetadataInjector(
self._repo_from_url(self.repository_url, http_client=http_client),
base_repo = self._repo_from_url(self.repository_url, http_client=http_client)
source = FileSizeEnrichmentRepository(
MetadataInjector(base_repo, http_client=http_client),
http_client=http_client,
max_concurrent_requests=10,
)
return model.Model(
source=source,
Expand Down
29 changes: 2 additions & 27 deletions simple_repository_browser/fetch_description.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import asyncio
import dataclasses
import datetime
import email.parser
Expand Down Expand Up @@ -144,35 +143,11 @@ async def package_info(

files_info: typing.Dict[str, FileInfo] = {}

# Get the size from the repository, if possible.
# Get the size from the repository files
for file in files:
if file.size:
files_info[file.filename] = FileInfo(
size=file.size,
)

limited_concurrency = asyncio.Semaphore(10)
# Compute the size of each file.
# TODO: This should be done as part of the repository component interface.
async with httpx.AsyncClient(verify=False) as http_client:

async def semaphored_head(filename: str, url: str):
async with limited_concurrency:
headers: dict[str, str] = {}
return (
filename,
await http_client.head(url, follow_redirects=True, headers=headers),
)

coros = [
semaphored_head(file.filename, file.url)
for file in files
if file.filename not in files_info
]
for coro in asyncio.as_completed(coros):
filename, response = await coro
files_info[filename] = FileInfo(
size=int(response.headers["Content-Length"]),
size=file.size or 0,
)

file = files[0]
Expand Down
147 changes: 147 additions & 0 deletions simple_repository_browser/filesize_enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""
FileSizeEnrichmentRepository component for adding file size information to project pages.

This component wraps another repository and automatically enriches file metadata
with size information by making HTTP HEAD requests to files that don't already
have size information.
"""

import asyncio
from dataclasses import replace
import logging
import typing

import httpx
from simple_repository import SimpleRepository, model
from simple_repository.components.core import RepositoryContainer

from ._typing_compat import override

logger = logging.getLogger(__name__)


class FileSizeEnrichmentRepository(RepositoryContainer):
    """
    Repository component that enriches file metadata with size information.

    Files on a project page that have no (or a zero) ``size`` but do have a
    URL get their size filled in from the ``Content-Length`` header of an
    HTTP HEAD request. The HEAD requests run concurrently, bounded by a
    semaphore that is shared by every call made through this instance.
    """

    def __init__(
        self,
        source: SimpleRepository,
        http_client: httpx.AsyncClient,
        *,
        max_concurrent_requests: int = 10,
    ) -> None:
        """
        Initialize the FileSizeEnrichmentRepository.

        Parameters
        ----------
        source:
            The underlying repository to wrap.
        http_client:
            HTTP client used for making the HEAD requests.
        max_concurrent_requests:
            Maximum number of HEAD requests in flight at any one time.
        """
        super().__init__(source)
        self.http_client = http_client
        self.semaphore = asyncio.Semaphore(max_concurrent_requests)

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: model.RequestContext = model.RequestContext.DEFAULT,
    ) -> model.ProjectDetail:
        """
        Get the project page with file sizes enriched.

        Files that don't have size information have their sizes fetched via
        HTTP HEAD requests in parallel. Files whose size could not be
        determined are returned unchanged.

        Parameters
        ----------
        project_name:
            Name of the project to look up in the wrapped repository.
        request_context:
            Passed through unchanged to the wrapped repository.

        Returns
        -------
        The project page from the wrapped repository; the same object when no
        file needed a lookup, otherwise a copy with updated ``files``.
        """
        project_page = await super().get_project_page(
            project_name, request_context=request_context
        )

        # Only files lacking a size *and* having a URL can be looked up.
        files_needing_size = [
            file for file in project_page.files if not file.size and file.url
        ]
        if not files_needing_size:
            # Nothing to enrich: hand back the upstream page untouched.
            return project_page

        size_info = await self._fetch_file_sizes(files_needing_size)

        # The file/page objects are rebuilt with ``replace`` rather than
        # mutated; the original file order is preserved.
        enriched_files = tuple(
            replace(file, size=size_info[file.filename])
            if file.filename in size_info
            else file
            for file in project_page.files
        )
        return replace(project_page, files=enriched_files)

    async def _fetch_file_sizes(
        self, files: typing.List[model.File]
    ) -> typing.Dict[str, int]:
        """
        Fetch file sizes for multiple files in parallel.

        Parameters
        ----------
        files:
            Files to fetch sizes for; each must have a ``url``.

        Returns
        -------
        Dictionary mapping filename to size in bytes. Files whose size could
        not be determined (request error, error status, missing or
        unparsable ``Content-Length``) are omitted.
        """

        async def fetch_single_file_size(
            file: model.File,
        ) -> typing.Tuple[str, typing.Optional[int]]:
            """Fetch the size of a single file, limited by the semaphore."""
            async with self.semaphore:
                try:
                    logger.debug(
                        "Fetching size for %s from %s", file.filename, file.url
                    )

                    # Make HEAD request to get Content-Length
                    response = await self.http_client.head(
                        file.url, follow_redirects=True, headers={}
                    )
                    response.raise_for_status()

                    content_length = response.headers.get("Content-Length")
                    if content_length:
                        return file.filename, int(content_length)

                    logger.warning(
                        "No Content-Length header for %s", file.filename
                    )
                    return file.filename, None
                except Exception as e:
                    # Best effort: a failed lookup must not break the page.
                    # Deliberately *not* ``BaseException`` so that
                    # ``asyncio.CancelledError`` (and KeyboardInterrupt /
                    # SystemExit) propagate instead of being swallowed.
                    logger.warning(
                        "Failed to get size for %s: %s", file.filename, e
                    )
                    return file.filename, None

        # ``return_exceptions=True`` keeps one unexpected failure from
        # discarding the sizes that were fetched successfully.
        results = await asyncio.gather(
            *(fetch_single_file_size(file) for file in files),
            return_exceptions=True,
        )

        size_info: typing.Dict[str, int] = {}
        for result in results:
            if isinstance(result, BaseException):
                logger.warning(
                    "Exception occurred during size fetching: %s", result
                )
                continue

            filename, size = result
            if size is not None:
                size_info[filename] = size

        return size_info
94 changes: 94 additions & 0 deletions simple_repository_browser/tests/test_filesize_enrichment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import typing
from unittest.mock import AsyncMock, MagicMock

import pytest
from simple_repository import SimpleRepository, model
import simple_repository.errors

from .._typing_compat import override
from ..filesize_enrichment import FileSizeEnrichmentRepository


class FakeRepository(SimpleRepository):
    """In-memory repository stub serving pages from ``project_pages``."""

    def __init__(self) -> None:
        # Maps project name -> the page to return for it; filled in by tests.
        self.project_pages: dict[str, model.ProjectDetail] = {}

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.ProjectDetail:
        """Return the stored page, or raise PackageNotFoundError if absent."""
        try:
            page = self.project_pages[project_name]
        except KeyError:
            raise simple_repository.errors.PackageNotFoundError(project_name)
        else:
            return page


@pytest.mark.asyncio
async def test_successful_size_enrichment() -> None:
    """Sizes missing from the source page are filled in from HEAD responses."""
    # Filename -> Content-Length header value served by the fake HTTP client.
    # Order matters: lookups below take the first filename found in the URL.
    content_lengths = {
        "test-1.0.whl": "1024",
        "test-1.0.tar.gz": "2048",
        "test-1.1.tar.gz": "3072",
        "test-1.2.tar.gz": "4096",
        "test-1.3.tar.gz": "5120",
        "test-1.4.tar.gz": "6144",
        "test-1.5.tar.gz": "7168",
    }

    # One size-less file per filename above, in the same order.
    source_repo = FakeRepository()
    source_repo.project_pages["test-project"] = model.ProjectDetail(
        meta=model.Meta("1.0"),
        name="test-project",
        files=tuple(
            model.File(name, f"http://example.com/{name}", {})
            for name in content_lengths
        ),
    )

    async def fake_head(url: str, **kwargs):
        """Answer a HEAD request with a Content-Length chosen by URL."""
        reply = MagicMock()
        reply.raise_for_status.return_value = None
        length = next(
            (value for name, value in content_lengths.items() if name in url),
            "1000",  # fallback for URLs matching none of the known files
        )
        reply.headers = {"Content-Length": length}
        return reply

    http_client = MagicMock()
    http_client.head = AsyncMock(side_effect=fake_head)

    repo_under_test = FileSizeEnrichmentRepository(
        source=source_repo,
        http_client=http_client,
        max_concurrent_requests=3,
    )

    page = await repo_under_test.get_project_page("test-project")

    # Every returned file carries the size advertised for its URL.
    wanted = [1024, 2048, 3072, 4096, 5120, 6144, 7168]
    for position, entry in enumerate(page.files):
        assert entry.size == wanted[position]

    # Exactly one HEAD request per file.
    assert http_client.head.call_count == 7