Skip to content

Commit 0b498b7

Browse files
authored
Merge pull request #17 from simple-repository/feature/metadata-injector
Refine the MetadataInjector implementation
2 parents afcf576 + 424e4d9 commit 0b498b7

File tree

4 files changed

+285
-32
lines changed

4 files changed

+285
-32
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ Homepage = "https://github.com/simple-repository/simple-repository-browser"
4040
[project.optional-dependencies]
4141
test = [
4242
"pytest",
43+
"pytest-asyncio",
4344
]
4445
dev = [
4546
"simple-repository-browser[test]",
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import sys
2+
import typing
3+
4+
if typing.TYPE_CHECKING:
5+
if sys.version_info >= (3, 11):
6+
from typing import Self
7+
else:
8+
from typing_extensions import Self
9+
10+
if sys.version_info >= (3, 12):
11+
from typing import override
12+
else:
13+
from typing_extensions import override
14+
15+
if sys.version_info >= (3, 8):
16+
from typing import Protocol, TypedDict
17+
else:
18+
from typing_extensions import Protocol, TypedDict
19+
20+
if sys.version_info >= (3, 10):
21+
from typing import TypeAlias
22+
else:
23+
from typing_extensions import TypeAlias
24+
25+
__all__ = [
26+
"override",
27+
"Self",
28+
"TypedDict",
29+
"TypeAlias",
30+
"Protocol",
31+
]
Lines changed: 99 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,68 +1,135 @@
1+
"""
2+
Extended MetadataInjector that supports sdist (.tar.gz) and zip (.zip) formats.
3+
4+
This extends SimpleRepository's MetadataInjectorRepository to provide metadata extraction
5+
for package formats beyond wheels.
6+
7+
"""
8+
19
from dataclasses import replace
210
import pathlib
311
import tarfile
12+
import typing
413
import zipfile
514

615
from simple_repository import model
716
from simple_repository.components.metadata_injector import MetadataInjectorRepository
817

918

10-
def get_metadata_from_sdist(package_path: pathlib.Path) -> str:
11-
archive = tarfile.TarFile.open(package_path)
12-
names = archive.getnames()
19+
def _extract_pkg_info_from_archive(
    archive_names: typing.List[str],
    extract_func: typing.Callable[[str], typing.Optional[typing.IO[bytes]]],
    package_name: str,
) -> str:
    """
    Extract PKG-INFO metadata from an archive.

    Every member whose path contains ``PKG-INFO`` is a candidate. Candidates
    are tried from the most deeply nested path to the shallowest, and the
    first one that decodes as UTF-8 and contains ``Metadata-Version`` wins.

    Args:
        archive_names: List of file names in the archive
        extract_func: Function to extract a file from the archive. It returns
            a readable binary file object, or None when the member cannot be
            opened. Every file object it returns is closed by this function.
        package_name: Name of the package for error messages

    Returns:
        Metadata content as string

    Raises:
        ValueError: If no valid metadata is found
    """
    candidates = [name for name in archive_names if "PKG-INFO" in name]
    # Deepest paths first. list.sort() is stable, so members at the same
    # depth keep the order they have in the archive listing.
    candidates.sort(key=lambda name: -len(name.split("/")))

    for candidate in candidates:
        f = extract_func(candidate)
        if f is None:
            continue
        try:
            # The context manager guarantees the member handle is released
            # whether or not the read/decode succeeds.
            with f:
                data = f.read().decode("utf-8")
        except (UnicodeDecodeError, OSError):
            # Skip files that can't be decoded or read
            continue
        if "Metadata-Version" in data:
            return data

    raise ValueError(f"No valid PKG-INFO metadata found in {package_name}")
56+
57+
58+
def get_metadata_from_sdist(package_path: pathlib.Path) -> str:
    """
    Return the PKG-INFO metadata held in a tar-based source distribution.

    The archive stays open for the duration of the lookup so that members can
    be read lazily; the search itself is delegated to
    ``_extract_pkg_info_from_archive``.
    """
    with tarfile.open(package_path) as sdist:
        return _extract_pkg_info_from_archive(
            sdist.getnames(),
            # extractfile() already matches the expected contract: it yields
            # a binary reader, or None for members that are not regular files.
            sdist.extractfile,
            package_path.name,
        )
2667

2768

2869
def get_metadata_from_zip(package_path: pathlib.Path) -> str:
    """
    Return the PKG-INFO metadata held in a zip archive.

    Zip is a legacy distribution format, used by packages like pyreadline.
    The search itself is delegated to ``_extract_pkg_info_from_archive``.
    """
    with zipfile.ZipFile(package_path) as zipped:

        def open_member(member: str) -> typing.Optional[typing.IO[bytes]]:
            # A member that is missing or unreadable is reported as None so
            # the caller moves on to the next candidate.
            try:
                handle = zipped.open(member, mode="r")
            except (KeyError, zipfile.BadZipFile):
                return None
            return handle

        return _extract_pkg_info_from_archive(
            zipped.namelist(),
            open_member,
            package_path.name,
        )
4581

4682

4783
class MetadataInjector(MetadataInjectorRepository):
    """
    Extended MetadataInjector that supports multiple package formats.

    This class extends SimpleRepository's MetadataInjectorRepository to provide
    metadata extraction for:
    - Wheel files (.whl) - handled by parent class
    - Source distributions (.tar.gz) - contains PKG-INFO files
    - Zip files (.zip) - legacy format used by some packages
    """

    # Map of supported file extensions to their extraction functions.
    # Each callable receives the injector instance and the package path.
    _EXTRACTORS: typing.Dict[
        str, typing.Callable[["MetadataInjector", pathlib.Path], str]
    ] = {
        ".whl": lambda self, path: self._get_metadata_from_wheel(path),
        ".tar.gz": lambda self, path: get_metadata_from_sdist(path),
        ".zip": lambda self, path: get_metadata_from_zip(path),
    }

    def _get_metadata_from_package(self, package_path: pathlib.Path) -> str:
        """
        Extract metadata from a package file based on its extension.

        Raises:
            ValueError: If the file name ends with none of the supported
                extensions.
        """
        package_name = package_path.name

        for extension, extractor in self._EXTRACTORS.items():
            if package_name.endswith(extension):
                return extractor(self, package_path)

        supported_formats = ", ".join(self._EXTRACTORS)
        raise ValueError(
            f"Unsupported package format: {package_name}. "
            f"Supported formats: {supported_formats}"
        )

    def _add_metadata_attribute(
        self,
        project_page: model.ProjectDetail,
    ) -> model.ProjectDetail:
        """
        Add the data-core-metadata attribute to all supported package files.

        A file is flagged (``dist_info_metadata=True``) when its filename ends
        with one of the extensions in ``_EXTRACTORS`` and it does not already
        carry a truthy ``dist_info_metadata`` value. Files with any other
        extension are passed through unchanged.
        """
        # str.endswith accepts a tuple; build it once rather than per file.
        supported_extensions = tuple(self._EXTRACTORS)
        files = tuple(
            replace(file, dist_info_metadata=True)
            if file.filename.endswith(supported_extensions)
            and not file.dist_info_metadata
            else file
            for file in project_page.files
        )
        return replace(project_page, files=files)
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
import pathlib
2+
import typing
3+
4+
import httpx
5+
import pytest
6+
from simple_repository import SimpleRepository, model
7+
import simple_repository.errors
8+
9+
from .._typing_compat import override
10+
from ..metadata_injector import MetadataInjector
11+
12+
13+
class FakeRepository(SimpleRepository):
    """
    A repository which serves pre-registered, in-memory content.

    Tests populate ``project_pages`` and ``resources`` directly; anything not
    registered is reported with the corresponding simple_repository error.
    """

    def __init__(self) -> None:
        # Project pages keyed by project name.
        self.project_pages: dict[str, model.ProjectDetail] = {}
        # Resources keyed by resource name (the project name is not part of the key).
        self.resources: dict[str, model.Resource] = {}

    @override
    async def get_project_page(
        self,
        project_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.ProjectDetail:
        """Return the registered page for *project_name*.

        Raises:
            PackageNotFoundError: If no page was registered under that name.
        """
        try:
            return self.project_pages[project_name]
        except KeyError:
            # Only a missing key means "not found"; anything else propagates.
            raise simple_repository.errors.PackageNotFoundError(
                project_name,
            ) from None

    @override
    async def get_resource(
        self,
        project_name: str,
        resource_name: str,
        *,
        request_context: typing.Optional[model.RequestContext] = None,
    ) -> model.Resource:
        """Return the registered resource named *resource_name*.

        ``project_name`` is accepted for interface compatibility but is not
        used in the lookup.

        Raises:
            ResourceUnavailable: If no resource was registered under that name.
        """
        try:
            return self.resources[resource_name]
        except KeyError:
            # Only a missing key means "unavailable"; anything else propagates.
            raise simple_repository.errors.ResourceUnavailable(
                resource_name,
            ) from None
44+
45+
46+
@pytest.fixture
def repository() -> MetadataInjector:
    """Provide a MetadataInjector layered over an empty FakeRepository."""
    fake_source = FakeRepository()
    client = httpx.AsyncClient()
    return MetadataInjector(source=fake_source, http_client=client)
52+
53+
54+
@pytest.fixture(scope="session")
def cache_dir() -> pathlib.Path:
    """Provide the ``cache`` directory next to this test module, creating it if needed."""
    location = pathlib.Path(__file__).with_name("cache")
    location.mkdir(exist_ok=True)
    return location
59+
60+
61+
async def download_package(url: str, cache_dir: pathlib.Path) -> pathlib.Path:
    """
    Fetch *url* into *cache_dir* unless a file of that name is already there.

    The cached file is named after the last path segment of the URL. Returns
    the path of the cached file; an HTTP error status raises via
    ``raise_for_status()``.
    """
    target = cache_dir / url.rsplit("/", 1)[-1]

    if not target.exists():
        async with httpx.AsyncClient(timeout=30.0) as client:
            reply = await client.get(url)
            reply.raise_for_status()
            target.write_bytes(reply.content)

    return target
75+
76+
77+
@pytest.mark.parametrize(
    ("url", "project", "version"),
    [
        (
            "https://files.pythonhosted.org/packages/a7/56/5f481ac5fcde5eb0fcfd5f3a421c0b345842cd9e0019048b8adeb17a3ecc/simple_repository-0.9.0-py3-none-any.whl",
            "simple-repository",
            "0.9.0",
        ),
        (
            "https://files.pythonhosted.org/packages/2e/19/d7c972dfe90a353dbd3efbbe1d14a5951de80c99c9dc1b93cd998d51dc0f/numpy-2.3.1.tar.gz",
            "numpy",
            "2.3.1",
        ),
        (
            "https://files.pythonhosted.org/packages/5e/20/91f4ed6fdc3c399fc58e9af1f812a1f5cb002f479494ecacc39b6be96032/numpy-1.10.0.post2.tar.gz",
            "numpy",
            "1.10.0.post2",
        ),
        (
            "https://files.pythonhosted.org/packages/bc/7c/d724ef1ec3ab2125f38a1d53285745445ec4a8f19b9bb0761b4064316679/pyreadline-2.1.zip",
            "pyreadline",
            "2.1",
        ),
    ],
)
@pytest.mark.asyncio
async def test_get_metadata_from_packages(
    cache_dir: pathlib.Path,
    repository: MetadataInjector,
    url: str,
    project: str,
    version: str,
) -> None:
    """Check that a ``.metadata`` request is answered for wheel, sdist and zip archives."""
    archive_path = await download_package(url, cache_dir)
    archive_name = url.split("/")[-1]

    # Register the downloaded archive with the fake upstream so the injector
    # can fetch it when asked for the sibling ".metadata" resource.
    upstream = typing.cast(FakeRepository, repository.source)
    upstream.resources[archive_name] = model.LocalResource(archive_path)

    served = await repository.get_resource(project, f"{archive_name}.metadata")
    assert isinstance(served, model.TextResource)

    text = served.text

    assert "Metadata-Version" in text
    assert f"Name: {project}" in text
    assert f"Version: {version}" in text
129+
130+
131+
@pytest.mark.asyncio
async def test_add_metadata_attribute_adds_to_all_files(
    repository: MetadataInjector,
) -> None:
    """Test the metadata attribute is added to every file with a supported extension.

    Files ending in .whl, .tar.gz or .zip must be flagged; a file with an
    unsupported extension (.egg) must be left untouched.
    """
    project_page = model.ProjectDetail(
        meta=model.Meta("1.0"),
        name="test-project",
        files=(
            model.File("test-1.0.whl", "", {}),
            model.File("test-1.0.tar.gz", "", {}),
            model.File("test-1.0.zip", "", {}),
            model.File("test-1.0.egg", "", {}),
        ),
    )
    source = typing.cast(FakeRepository, repository.source)
    source.project_pages["test-project"] = project_page

    detail = await repository.get_project_page("test-project")

    # Key by filename so a failure names the offending file, not an index.
    flags = {file.filename: file.dist_info_metadata for file in detail.files}
    assert len(detail.files) == 4
    assert flags["test-1.0.whl"] is True
    assert flags["test-1.0.tar.gz"] is True
    assert flags["test-1.0.zip"] is True
    assert flags["test-1.0.egg"] is None  # unsupported extension

0 commit comments

Comments
 (0)