From 2771ff06a41f1935f6e41fc22913e1f2b1ee191f Mon Sep 17 00:00:00 2001 From: Phil Elson Date: Fri, 18 Jul 2025 15:20:04 +0200 Subject: [PATCH] Extract the ability to get file size via http HEAD request into its own component --- simple_repository_browser/_app.py | 7 +- .../fetch_description.py | 29 +--- .../filesize_enrichment.py | 147 ++++++++++++++++++ .../tests/test_filesize_enrichment.py | 94 +++++++++++ 4 files changed, 248 insertions(+), 29 deletions(-) create mode 100644 simple_repository_browser/filesize_enrichment.py create mode 100644 simple_repository_browser/tests/test_filesize_enrichment.py diff --git a/simple_repository_browser/_app.py b/simple_repository_browser/_app.py index 92f72a3..cc6bd44 100644 --- a/simple_repository_browser/_app.py +++ b/simple_repository_browser/_app.py @@ -14,6 +14,7 @@ from simple_repository.components.local import LocalRepository from . import controller, crawler, errors, fetch_projects, model, view +from .filesize_enrichment import FileSizeEnrichmentRepository from .metadata_injector import MetadataInjector from .static_files import generate_manifest @@ -141,9 +142,11 @@ def _repo_from_url( def create_model( self, http_client: httpx.AsyncClient, database: aiosqlite.Connection ) -> model.Model: - source = MetadataInjector( - self._repo_from_url(self.repository_url, http_client=http_client), + base_repo = self._repo_from_url(self.repository_url, http_client=http_client) + source = FileSizeEnrichmentRepository( + MetadataInjector(base_repo, http_client=http_client), http_client=http_client, + max_concurrent_requests=10, ) return model.Model( source=source, diff --git a/simple_repository_browser/fetch_description.py b/simple_repository_browser/fetch_description.py index 2ed13b4..d4e747f 100644 --- a/simple_repository_browser/fetch_description.py +++ b/simple_repository_browser/fetch_description.py @@ -1,4 +1,3 @@ -import asyncio import dataclasses import datetime import email.parser @@ -144,35 +143,11 @@ async def 
package_info( files_info: typing.Dict[str, FileInfo] = {} - # Get the size from the repository, if possible. + # Get the size from the repository files for file in files: if file.size: files_info[file.filename] = FileInfo( - size=file.size, - ) - - limited_concurrency = asyncio.Semaphore(10) - # Compute the size of each file. - # TODO: This should be done as part of the repository component interface. - async with httpx.AsyncClient(verify=False) as http_client: - - async def semaphored_head(filename: str, url: str): - async with limited_concurrency: - headers: dict[str, str] = {} - return ( - filename, - await http_client.head(url, follow_redirects=True, headers=headers), - ) - - coros = [ - semaphored_head(file.filename, file.url) - for file in files - if file.filename not in files_info - ] - for coro in asyncio.as_completed(coros): - filename, response = await coro - files_info[filename] = FileInfo( - size=int(response.headers["Content-Length"]), + size=file.size or 0, ) file = files[0] diff --git a/simple_repository_browser/filesize_enrichment.py b/simple_repository_browser/filesize_enrichment.py new file mode 100644 index 0000000..0d9a43d --- /dev/null +++ b/simple_repository_browser/filesize_enrichment.py @@ -0,0 +1,147 @@ +""" +FileSizeEnrichmentRepository component for adding file size information to project pages. + +This component wraps another repository and automatically enriches file metadata +with size information by making HTTP HEAD requests to files that don't already +have size information. +""" + +import asyncio +from dataclasses import replace +import logging +import typing + +import httpx +from simple_repository import SimpleRepository, model +from simple_repository.components.core import RepositoryContainer + +from ._typing_compat import override + +logger = logging.getLogger(__name__) + + +class FileSizeEnrichmentRepository(RepositoryContainer): + """ + Repository component that enriches file metadata with size information. 
+ + This component automatically adds size information to files that don't already + have it by making HTTP HEAD requests. It maintains parallelism for efficiency + while respecting concurrency limits. + """ + + def __init__( + self, + source: SimpleRepository, + http_client: httpx.AsyncClient, + *, + max_concurrent_requests: int = 10, + ) -> None: + """ + Initialize the FileSizeEnrichmentRepository. + + Parameters + ---------- + source: The underlying repository to wrap + + http_client: HTTP client for making HEAD requests + + max_concurrent_requests: Maximum number of concurrent HEAD requests + """ + super().__init__(source) + self.http_client = http_client + self.semaphore = asyncio.Semaphore(max_concurrent_requests) + + @override + async def get_project_page( + self, + project_name: str, + *, + request_context: model.RequestContext = model.RequestContext.DEFAULT, + ) -> model.ProjectDetail: + """ + Get project page with file sizes enriched. + + Files that don't have size information will have their sizes fetched + via HTTP HEAD requests in parallel. + """ + project_page = await super().get_project_page( + project_name, request_context=request_context + ) + + # Identify files that need size information + files_needing_size = [ + file for file in project_page.files if not file.size and file.url + ] + + if not files_needing_size: + # No files need size information, return as-is + return project_page + + # Fetch sizes for files that need them + size_info = await self._fetch_file_sizes(files_needing_size) + + # Create new files with updated size information + enriched_files = [] + for file in project_page.files: + if file.filename in size_info: + file = replace(file, size=size_info[file.filename]) + enriched_files.append(file) + + return replace(project_page, files=tuple(enriched_files)) + + async def _fetch_file_sizes( + self, files: typing.List[model.File] + ) -> typing.Dict[str, int]: + """ + Fetch file sizes for multiple files in parallel. 
+ + Args: + files: List of files to fetch sizes for + + Returns: + Dictionary mapping filename to size in bytes + """ + + async def fetch_single_file_size( + file: model.File, + ) -> typing.Tuple[str, typing.Optional[int]]: + """Fetch one file's size under the semaphore; size is None on failure.""" + async with self.semaphore: + try: + logger.debug(f"Fetching size for {file.filename} from {file.url}") + + # Make HEAD request to get Content-Length + response = await self.http_client.head( + file.url, follow_redirects=True, headers={} + ) + response.raise_for_status() + + content_length = response.headers.get("Content-Length") + if content_length: + return file.filename, int(content_length) + else: + logger.warning(f"No Content-Length header for {file.filename}") + return file.filename, None + + except Exception as e: + logger.warning(f"Failed to get size for {file.filename}: {e}") + return file.filename, None + + # Create tasks for all files + tasks = [fetch_single_file_size(file) for file in files] + + # Wait for all tasks to complete + results = await asyncio.gather(*tasks, return_exceptions=True) + + # Process results, filtering out failures + size_info = {} + for result in results: + if isinstance(result, BaseException): + logger.warning(f"Exception occurred during size fetching: {result}") + continue + + filename, size = result + if size is not None: + size_info[filename] = size + + return size_info diff --git a/simple_repository_browser/tests/test_filesize_enrichment.py b/simple_repository_browser/tests/test_filesize_enrichment.py new file mode 100644 index 0000000..d9a1fdf --- /dev/null +++ b/simple_repository_browser/tests/test_filesize_enrichment.py @@ -0,0 +1,94 @@ +import typing +from unittest.mock import AsyncMock, MagicMock + +import pytest +from simple_repository import SimpleRepository, model +import simple_repository.errors + +from .._typing_compat import override +from ..filesize_enrichment import FileSizeEnrichmentRepository + + +class 
FakeRepository(SimpleRepository): + def __init__(self) -> None: + self.project_pages: dict[str, model.ProjectDetail] = {} + + @override + async def get_project_page( + self, + project_name: str, + *, + request_context: typing.Optional[model.RequestContext] = None, + ) -> model.ProjectDetail: + try: + return self.project_pages[project_name] + except KeyError: + raise simple_repository.errors.PackageNotFoundError(project_name) + + +@pytest.mark.asyncio +async def test_successful_size_enrichment() -> None: + """Test successful enrichment of file sizes.""" + project_page = model.ProjectDetail( + meta=model.Meta("1.0"), + name="test-project", + files=( + model.File("test-1.0.whl", "http://example.com/test-1.0.whl", {}), + model.File("test-1.0.tar.gz", "http://example.com/test-1.0.tar.gz", {}), + model.File("test-1.1.tar.gz", "http://example.com/test-1.1.tar.gz", {}), + model.File("test-1.2.tar.gz", "http://example.com/test-1.2.tar.gz", {}), + model.File("test-1.3.tar.gz", "http://example.com/test-1.3.tar.gz", {}), + model.File("test-1.4.tar.gz", "http://example.com/test-1.4.tar.gz", {}), + model.File("test-1.5.tar.gz", "http://example.com/test-1.5.tar.gz", {}), + ), + ) + fake_repository = FakeRepository() + fake_repository.project_pages["test-project"] = project_page + + # Create mock HTTP client that returns Content-Length headers + mock_http_client = MagicMock() + + async def mock_head(url: str, **kwargs): + """Mock HEAD request that returns Content-Length based on filename.""" + response = MagicMock() + response.raise_for_status.return_value = None + + # Return different sizes based on URL + if "test-1.0.whl" in url: + response.headers = {"Content-Length": "1024"} + elif "test-1.0.tar.gz" in url: + response.headers = {"Content-Length": "2048"} + elif "test-1.1.tar.gz" in url: + response.headers = {"Content-Length": "3072"} + elif "test-1.2.tar.gz" in url: + response.headers = {"Content-Length": "4096"} + elif "test-1.3.tar.gz" in url: + response.headers = 
{"Content-Length": "5120"} + elif "test-1.4.tar.gz" in url: + response.headers = {"Content-Length": "6144"} + elif "test-1.5.tar.gz" in url: + response.headers = {"Content-Length": "7168"} + else: + response.headers = {"Content-Length": "1000"} + + return response + + mock_http_client.head = AsyncMock(side_effect=mock_head) + + # Create enrichment repository + enrichment_repo = FileSizeEnrichmentRepository( + source=fake_repository, + http_client=mock_http_client, + max_concurrent_requests=3, + ) + + # Test that sizes are enriched + result = await enrichment_repo.get_project_page("test-project") + + # Check that all files have the expected sizes + expected_sizes = [1024, 2048, 3072, 4096, 5120, 6144, 7168] + for i, file in enumerate(result.files): + assert file.size == expected_sizes[i] + + # Verify that HEAD requests were made for all files + assert mock_http_client.head.call_count == 7