From 77d610e2a4d352ca236604c32fec3b3d63365123 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 5 Nov 2024 16:19:22 -0500 Subject: [PATCH 1/2] When making self links absolute, also add `file://` as the scheme --- pystac/asset.py | 11 ++++++--- pystac/link.py | 2 +- pystac/stac_io.py | 5 ++-- pystac/utils.py | 62 ++++++++++++++++++++++++++++++++--------------- 4 files changed, 55 insertions(+), 25 deletions(-) diff --git a/pystac/asset.py b/pystac/asset.py index e42b8237b..141938d01 100644 --- a/pystac/asset.py +++ b/pystac/asset.py @@ -8,7 +8,12 @@ from pystac import MediaType, STACError, common_metadata, utils from pystac.html.jinja_env import get_jinja_env -from pystac.utils import is_absolute_href, make_absolute_href, make_relative_href +from pystac.utils import ( + is_absolute_href, + make_absolute_href, + make_relative_href, + safe_urlparse, +) if TYPE_CHECKING: from pystac.common_metadata import CommonMetadata @@ -380,7 +385,7 @@ def get_self_href(self) -> str | None: def _absolute_href(href: str, owner: Assets | None, action: str = "access") -> str: if utils.is_absolute_href(href): - return href + return safe_urlparse(href).path else: item_self = owner.get_self_href() if owner else None if item_self is None: @@ -389,4 +394,4 @@ def _absolute_href(href: str, owner: Assets | None, action: str = "access") -> s "and owner item is not set. Hint: try using " ":func:`~pystac.Item.make_asset_hrefs_absolute`" ) - return utils.make_absolute_href(href, item_self) + return safe_urlparse(utils.make_absolute_href(href, item_self)).path diff --git a/pystac/link.py b/pystac/link.py index f88061090..1d7d97bad 100644 --- a/pystac/link.py +++ b/pystac/link.py @@ -102,7 +102,7 @@ def __init__( self.rel = rel if isinstance(target, str): if rel == pystac.RelType.SELF: - self._target_href = make_absolute_href(target) + self._target_href = make_absolute_href(target, must_include_scheme=True) else: self._target_href = make_posix_style(target) self._target_object = None diff --git a/pystac/stac_io.py b/pystac/stac_io.py index abe5bc26a..4bfe7488a 100644 --- a/pystac/stac_io.py +++ b/pystac/stac_io.py @@ -303,6 +303,7 @@ def read_text_from_href(self, href: str) -> str: except HTTPError as e: raise Exception(f"Could not read uri {href}") from e else: + href = safe_urlparse(href).path with open(href, encoding="utf-8") as f: href_contents = f.read() return href_contents @@ -328,7 +329,7 @@ def write_text_to_href(self, href: str, txt: str) -> None: """ if _is_url(href): raise NotImplementedError("DefaultStacIO cannot write to urls") - href = os.fspath(href) + href = safe_urlparse(href).path dirname = os.path.dirname(href) if dirname != "" and not os.path.isdir(dirname): os.makedirs(dirname) @@ -391,7 +392,7 @@ def _report_duplicate_object_names( def _is_url(href: str) -> bool: parsed = safe_urlparse(href) - return parsed.scheme != "" + return parsed.scheme != "" and parsed.scheme != "file" if HAS_URLLIB3: diff --git a/pystac/utils.py b/pystac/utils.py index 76c3f7100..478daaa43 100644 --- a/pystac/utils.py +++ b/pystac/utils.py @@ -288,34 +288,54 @@ def _make_absolute_href_path( parsed_source: URLParseResult, parsed_start: URLParseResult, start_is_dir: bool = False, + must_include_scheme: bool = False, ) -> str: - # If the source is already absolute, just return it + # If the source is already absolute and doesn't need a scheme just return it if os.path.isabs(parsed_source.path): - return urlunparse(parsed_source) - - # If the start path is not a directory, get the parent directory - start_dir = ( - parsed_start.path if start_is_dir else os.path.dirname(parsed_start.path) - ) + if must_include_scheme: + abs_path = parsed_source.path + else: + return urlunparse(parsed_source) + else: + # If the start path is not a directory, get the parent directory + start_dir = ( + parsed_start.path if start_is_dir else os.path.dirname(parsed_start.path) + ) - # Join the start directory to the relative path and find the absolute path - abs_path = make_posix_style( - os.path.abspath(os.path.join(start_dir, parsed_source.path)) - ) + # Join the start directory to the relative path and find the absolute path + abs_path = make_posix_style( + os.path.abspath(os.path.join(start_dir, parsed_source.path)) + ) - # Account for the normalization of abspath for - # things like /vsitar// prefixes by replacing the - # original start_dir text when abspath modifies the start_dir. - if not start_dir == make_posix_style(os.path.abspath(start_dir)): - abs_path = abs_path.replace( - make_posix_style(os.path.abspath(start_dir)), start_dir + # Account for the normalization of abspath for + # things like /vsitar// prefixes by replacing the + # original start_dir text when abspath modifies the start_dir. + if not start_dir == make_posix_style(os.path.abspath(start_dir)): + abs_path = abs_path.replace( + make_posix_style(os.path.abspath(start_dir)), start_dir + ) + + # add a scheme if there isn't one already + if must_include_scheme: + return urlunparse( + ( + "file", + parsed_start.netloc, + abs_path, + parsed_source.params, + parsed_source.query, + parsed_source.fragment, + ) ) return abs_path def make_absolute_href( - source_href: str, start_href: str | None = None, start_is_dir: bool = False + source_href: str, + start_href: str | None = None, + start_is_dir: bool = False, + must_include_scheme: bool = False, ) -> str: """Returns a new string that represents ``source_href`` as an absolute path. If ``source_href`` is already absolute it is returned unchanged. If ``source_href`` @@ -332,6 +352,8 @@ def make_absolute_href( start_is_dir : If ``True``, ``start_href`` is treated as a directory. Otherwise, ``start_href`` is considered to be a path to a file. Defaults to ``False``. + must_include_scheme : If ``True``, every output will have a scheme. This means + that local filepaths will be prefixed with `file://`. Defaults to ``False``. Returns: str: The absolute HREF. @@ -349,7 +371,9 @@ def make_absolute_href( if parsed_source.scheme != "" or parsed_start.scheme != "": return _make_absolute_href_url(parsed_source, parsed_start, start_is_dir) else: - return _make_absolute_href_path(parsed_source, parsed_start, start_is_dir) + return _make_absolute_href_path( + parsed_source, parsed_start, start_is_dir, must_include_scheme + ) def is_absolute_href(href: str) -> bool: From 8446839a9c4c61d3f65c60e6b81fe2308e634c43 Mon Sep 17 00:00:00 2001 From: Julia Signell Date: Tue, 5 Nov 2024 16:55:54 -0500 Subject: [PATCH 2/2] Updating some tests to accomodate `file://` in self links --- tests/test_asset.py | 18 +++++++++--------- tests/test_catalog.py | 2 +- tests/test_collection.py | 2 +- tests/test_item.py | 6 +++--- tests/test_layout.py | 6 +++--- tests/test_link.py | 16 ++++++++++++---- tests/validation/test_validate.py | 4 +++- 7 files changed, 32 insertions(+), 22 deletions(-) diff --git a/tests/test_asset.py b/tests/test_asset.py index b41113403..165fddac7 100644 --- a/tests/test_asset.py +++ b/tests/test_asset.py @@ -21,9 +21,9 @@ def test_alter_asset_absolute_path( assert asset.get_absolute_href() == new_href assert os.path.exists(new_href) if action == "move": - assert not os.path.exists(old_href) + assert not os.path.exists(old_href.replace("file://", "")) elif action == "copy": - assert os.path.exists(old_href) + assert os.path.exists(old_href.replace("file://", "")) @pytest.mark.parametrize("action", ["copy", "move"]) @@ -38,11 +38,11 @@ def test_alter_asset_relative_path(action: str, tmp_asset: pystac.Asset) -> None assert asset.href == new_href href = asset.get_absolute_href() assert href is not None - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) if action == "move": - assert not os.path.exists(old_href) + assert not os.path.exists(old_href.replace("file://", "")) elif action == "copy": - assert os.path.exists(old_href) + assert os.path.exists(old_href.replace("file://", "")) @pytest.mark.parametrize("action", ["copy", "move"]) @@ -82,18 +82,18 @@ def test_delete_asset(tmp_asset: pystac.Asset) -> None: asset = tmp_asset href = asset.get_absolute_href() assert href is not None - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) asset.delete() - assert not os.path.exists(href) + assert not os.path.exists(href.replace("file://", "")) def test_delete_asset_relative_no_owner_fails(tmp_asset: pystac.Asset) -> None: asset = tmp_asset href = asset.get_absolute_href() assert href is not None - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) asset.owner = None @@ -101,4 +101,4 @@ def test_delete_asset_relative_no_owner_fails(tmp_asset: pystac.Asset) -> None: asset.delete() assert asset.href in str(e.value) - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) diff --git a/tests/test_catalog.py b/tests/test_catalog.py index cb8bdace1..799145299 100644 --- a/tests/test_catalog.py +++ b/tests/test_catalog.py @@ -541,7 +541,7 @@ def test_save_with_different_stac_io(self) -> None: assert len(hrefs) == stac_io.mock.write_text.call_count for call_args_list in stac_io.mock.write_text.call_args_list: - assert call_args_list[0][0] in hrefs + assert f"file://{call_args_list[0][0]}" in hrefs def test_subcatalogs_saved_to_correct_path(self) -> None: with tempfile.TemporaryDirectory() as tmp_dir: diff --git a/tests/test_collection.py b/tests/test_collection.py index c339a8d10..b5ac60a6b 100644 --- a/tests/test_collection.py +++ b/tests/test_collection.py @@ -691,7 +691,7 @@ def test_delete_asset_relative_no_self_link_fails( assert asset.href in str(e.value) assert name in collection.assets - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) def test_permissive_temporal_extent_deserialization(collection: Collection) -> None: diff --git a/tests/test_item.py b/tests/test_item.py index a1c39f12e..beed81a2d 100644 --- a/tests/test_item.py +++ b/tests/test_item.py @@ -92,11 +92,11 @@ def test_asset_absolute_href(self) -> None: item.set_self_href(item_path) rel_asset = Asset("./data.geojson") rel_asset.set_owner(item) - expected_href = make_posix_style( + expected_filepath = make_posix_style( os.path.abspath(os.path.join(os.path.dirname(item_path), "./data.geojson")) ) actual_href = rel_asset.get_absolute_href() - self.assertEqual(expected_href, actual_href) + self.assertEqual(f"file://{expected_filepath}", actual_href) def test_asset_absolute_href_no_item_self(self) -> None: item_dict = self.get_example_item_dict() @@ -612,7 +612,7 @@ def test_delete_asset_relative_no_self_link_fails(tmp_asset: pystac.Asset) -> No assert asset.href in str(e.value) assert name in item.assets - assert os.path.exists(href) + assert os.path.exists(href.replace("file://", "")) def test_resolve_collection_with_root( diff --git a/tests/test_layout.py b/tests/test_layout.py index ff27f76b2..2816254de 100644 --- a/tests/test_layout.py +++ b/tests/test_layout.py @@ -421,7 +421,7 @@ def test_catalog(self) -> None: href = self.strategy.get_href( cat, parent_dir="https://example.com", is_root=True ) - self.assertEqual(href, "/an/href") + self.assertEqual(href, "file:///an/href") def test_collection(self) -> None: collection = TestCases.case_8() @@ -434,7 +434,7 @@ def test_collection(self) -> None: href = self.strategy.get_href( collection, parent_dir="https://example.com", is_root=True ) - self.assertEqual(href, "/an/href") + self.assertEqual(href, "file:///an/href") def test_item(self) -> None: collection = TestCases.case_8() @@ -444,7 +444,7 @@ def test_item(self) -> None: self.strategy.get_href(item, parent_dir="http://example.com") item.set_self_href("/an/href") href = self.strategy.get_href(item, parent_dir="http://example.com") - self.assertEqual(href, "/an/href") + self.assertEqual(href, "file:///an/href") class APILayoutStrategyTest(unittest.TestCase): diff --git a/tests/test_link.py b/tests/test_link.py index c7ecdb6cc..7d8fe5579 100644 --- a/tests/test_link.py +++ b/tests/test_link.py @@ -107,7 +107,9 @@ def test_resolved_self_href(self) -> None: link = catalog.get_single_link(pystac.RelType.SELF) assert link link.resolve_stac_object() - self.assertEqual(link.get_absolute_href(), make_posix_style(path)) + self.assertEqual( + link.get_absolute_href(), f"file://{make_posix_style(path)}" + ) def test_target_getter_setter(self) -> None: link = pystac.Link("my rel", target="./foo/bar.json") @@ -140,7 +142,10 @@ def test_relative_self_href(self) -> None: item = pystac.read_file("item.json") href = item.get_self_href() assert href - self.assertTrue(os.path.isabs(href), f"Not an absolute path: {href}") + self.assertTrue( + os.path.isabs(href.replace("file://", "")), + f"Not an absolute path: {href}", + ) finally: os.chdir(previous) @@ -237,7 +242,10 @@ def test_from_dict_round_trip(self) -> None: d2 = pystac.Link.from_dict(d).to_dict() self.assertEqual(d, d2) d = {"rel": "self", "href": "t"} - d2 = {"rel": "self", "href": make_posix_style(os.path.join(os.getcwd(), "t"))} + d2 = { + "rel": "self", + "href": f"file://{make_posix_style(os.path.join(os.getcwd(), 't'))}", + } self.assertEqual(pystac.Link.from_dict(d).to_dict(), d2) def test_from_dict_failures(self) -> None: @@ -333,7 +341,7 @@ def test_relative_self_link(tmp_path: Path) -> None: assert read_item asset_href = read_item.assets["data"].get_absolute_href() assert asset_href - assert Path(asset_href).exists() + assert Path(asset_href.replace("file://", "")).exists() @pytest.mark.parametrize("rel", HIERARCHICAL_LINKS) diff --git a/tests/validation/test_validate.py b/tests/validation/test_validate.py index d3f5dd416..1d92ef4ff 100644 --- a/tests/validation/test_validate.py +++ b/tests/validation/test_validate.py @@ -125,7 +125,9 @@ def test_validate_all_dict(self, test_case: pystac.Catalog) -> None: dst_dir = os.path.join(tmp_dir, "catalog") # Copy test case 7 to the temporary directory catalog_href = get_opt(TestCases.case_7().get_self_href()) - shutil.copytree(os.path.dirname(catalog_href), dst_dir) + shutil.copytree( + os.path.dirname(catalog_href.replace("file://", "")), dst_dir + ) new_cat_href = os.path.join(dst_dir, "catalog.json")