14
14
from collections .abc import Callable , Iterable , Sequence
15
15
from typing import Any , ClassVar , cast
16
16
17
+ import xarray as xr
18
+ from rasterio .crs import CRS
19
+ from rioxarray .merge import merge_arrays
20
+ from rtree .index import Index , Property
21
+
17
22
import fiona
18
23
import fiona .transform
19
24
import numpy as np
@@ -1238,3 +1243,260 @@ def res(self, new_res: float) -> None:
1238
1243
self ._res = new_res
1239
1244
self .datasets [0 ].res = new_res
1240
1245
self .datasets [1 ].res = new_res
1246
+
1247
+
1248
+
1249
class RioXarrayDataset(GeoDataset):
    """Wrapper for geographical datasets stored as Xarray Datasets.

    In-memory geographical xarray.DataArray and xarray.Dataset.

    Relies on rioxarray.

    .. versionadded:: 0.7.0
    """

    filename_glob = "*"
    filename_regex = ".*"

    is_image = True

    # Names rioxarray should treat as the spatial dimensions.
    spatial_x_name = "x"
    spatial_y_name = "y"

    transform = None

    @property
    def dtype(self) -> torch.dtype:
        """The dtype of the dataset (overrides the dtype of the data file via a cast).

        Returns:
            the dtype of the dataset
        """
        if self.is_image:
            return torch.float32
        else:
            return torch.long

    def harmonize_format(self, ds):
        """Convert the dataset to the standard format.

        Args:
            ds: dataset or array to harmonize

        Returns:
            the harmonized dataset or array
        """
        # rioxarray expects spatial dimensions to be named x and y
        ds.rio.set_spatial_dims(self.spatial_x_name, self.spatial_y_name, inplace=True)

        # if x coords go from 0 to 360, convert to -180 to 180
        if ds[self.spatial_x_name].min() > 180:
            ds = ds.assign_coords(
                {self.spatial_x_name: ds[self.spatial_x_name] % 360 - 180}
            )

        # if y coords go from 0 to 180, convert to -90 to 90
        # (bug fix: inspect the y coordinate here; previously x was checked again)
        if ds[self.spatial_y_name].min() > 90:
            ds = ds.assign_coords(
                {self.spatial_y_name: ds[self.spatial_y_name] % 180 - 90}
            )
        # expect ascending coordinate values
        ds = ds.sortby(self.spatial_x_name, ascending=True)
        ds = ds.sortby(self.spatial_y_name, ascending=True)
        return ds

    def __init__(
        self,
        paths: Path | Iterable[Path] = 'data',
        data_variables: list[str] | None = None,
        transforms: Callable[[dict[str, Any]], dict[str, Any]] | None = None,
    ) -> None:
        """Initialize a new Dataset instance.

        Args:
            paths: one or more root directories to search or files to load
            data_variables: data variables that should be gathered from the
                collection of xarray datasets; if ``None``, all data variables
                found in the files are collected
            transforms: a function/transform that takes an input sample
                and returns a transformed version

        Raises:
            FileNotFoundError: if files are not found in ``paths``
        """
        super().__init__(transforms)

        self.paths = paths

        if data_variables:
            self.data_variables = data_variables
        else:
            data_variables_to_collect: list[str] = []

        self.transforms = transforms

        # Create an R-tree to index the dataset (3 dims: x, y, time)
        self.index = Index(interleaved=False, properties=Property(dimension=3))

        # Normalize ``paths`` to a list of root directories/files.
        # Bug fix: the original code referenced an undefined name ``root``.
        if isinstance(paths, (str, os.PathLike)):
            roots = [paths]
        else:
            roots = list(paths)

        # Populate the dataset index
        i = 0
        filename_regex = re.compile(self.filename_regex, re.VERBOSE)
        for root in roots:
            pathname = os.path.join(root, self.filename_glob)
            for filepath in glob.iglob(pathname, recursive=True):
                match = re.match(filename_regex, os.path.basename(filepath))
                if match is None:
                    continue
                with xr.open_dataset(filepath, decode_times=True) as ds:
                    ds = self.harmonize_format(ds)

                    try:
                        (minx, miny, maxx, maxy) = ds.rio.bounds()
                    except AttributeError:
                        # no georeferencing information available: skip file
                        continue

                    if hasattr(ds, "time"):
                        try:
                            # CFTimeIndex -> DatetimeIndex when the calendar
                            # allows it; plain DatetimeIndex lacks the method
                            indices = ds.indexes["time"].to_datetimeindex()
                        except AttributeError:
                            indices = ds.indexes["time"]

                        mint = indices.min().to_pydatetime().timestamp()
                        maxt = indices.max().to_pydatetime().timestamp()
                    else:
                        # no time axis: cover the full temporal extent
                        mint = 0
                        maxt = sys.maxsize
                    coords = (minx, maxx, miny, maxy, mint, maxt)
                    self.index.insert(i, coords, filepath)
                    i += 1

                    # collect all possible data variables if none were given
                    if not data_variables:
                        data_variables_to_collect.extend(list(ds.data_vars))

        if i == 0:
            # Bug fix: removed a leftover ``pdb.set_trace()`` debugger trap
            msg = f"No {self.__class__.__name__} data was found in `paths='{self.paths}'`"
            raise FileNotFoundError(msg)

        if not data_variables:
            self.data_variables = list(set(data_variables_to_collect))

        self.res = 1.0

    def _infer_spatial_coordinate_names(self, ds) -> tuple[str, str]:
        """Infer the names of the spatial coordinates.

        Args:
            ds: Dataset or DataArray of which to infer the spatial coordinates

        Returns:
            x and y coordinate names

        Raises:
            ValueError: if no CF-style latitude/longitude units are found
        """
        x_name = None
        y_name = None
        for coord_name, coord in ds.coords.items():
            if hasattr(coord, "units"):
                units = coord.units.lower()
                if any(u in units for u in ("degrees_north", "degree_north")):
                    y_name = coord_name
                elif any(u in units for u in ("degrees_east", "degree_east")):
                    x_name = coord_name

        if not x_name or not y_name:
            raise ValueError("Spatial Coordinate Units not found in Dataset.")

        return x_name, y_name

    def __getitem__(self, query: BoundingBox) -> dict[str, Any]:
        """Retrieve image/mask and metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of image/mask and metadata at that index

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        items = [hit.object for hit in hits]

        if not items:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        data_arrays: list["np.typing.NDArray"] = []
        for item in items:
            with xr.open_dataset(item, decode_cf=True) as ds:
                ds = self.harmonize_format(ds)
                # select time dimension
                if hasattr(ds, "time"):
                    try:
                        ds["time"] = ds.indexes["time"].to_datetimeindex()
                    except AttributeError:
                        ds["time"] = ds.indexes["time"]
                    ds = ds.sel(
                        time=slice(
                            datetime.fromtimestamp(query.mint),
                            datetime.fromtimestamp(query.maxt),
                        )
                    )

                for variable in self.data_variables:
                    if not hasattr(ds, variable):
                        continue
                    da = ds[variable]
                    # clip box ignores time dimension
                    clipped = da.rio.clip_box(
                        minx=query.minx,
                        miny=query.miny,
                        maxx=query.maxx,
                        maxy=query.maxy,
                    )
                    # rioxarray expects (time, y, x, ...) ordering; only
                    # include "time" when the variable actually has it
                    # (bug fix: the unconditional transpose failed for
                    # variables without a time dimension)
                    if "time" in clipped.dims:
                        clipped = clipped.transpose(
                            "time", self.spatial_y_name, self.spatial_x_name, ...
                        )
                    else:
                        clipped = clipped.transpose(
                            self.spatial_y_name, self.spatial_x_name, ...
                        )
                    data_arrays.append(clipped.squeeze())

        # Bug fix: removed a leftover ``pdb.set_trace()`` debugger trap here
        merged_data = torch.from_numpy(
            merge_arrays(
                data_arrays, bounds=(query.minx, query.miny, query.maxx, query.maxy)
            ).data
        )
        sample = {"bbox": query}

        merged_data = merged_data.to(self.dtype)
        if self.is_image:
            sample["image"] = merged_data
        else:
            sample["mask"] = merged_data

        if self.transforms is not None:
            sample = self.transforms(sample)

        return sample
0 commit comments