From 4dde9cad3a5d20bc5582785d1a83e9e0e064b494 Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 13:48:45 +0100 Subject: [PATCH 1/6] int64-hilbert-distance set hilbert distance to dtype to int64 --- dask_geopandas/core.py | 2 +- dask_geopandas/hilbert_distance.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dask_geopandas/core.py b/dask_geopandas/core.py index 25e2c6db..b8b9ab75 100644 --- a/dask_geopandas/core.py +++ b/dask_geopandas/core.py @@ -389,7 +389,7 @@ def hilbert_distance(self, total_bounds=None, level=16): _hilbert_distance, total_bounds=total_bounds, level=level, - meta=pd.Series([], name="hilbert_distance", dtype="uint32"), + meta=pd.Series([], name="hilbert_distance", dtype=np.int64), ) return distances diff --git a/dask_geopandas/hilbert_distance.py b/dask_geopandas/hilbert_distance.py index b8d5a752..0854e163 100644 --- a/dask_geopandas/hilbert_distance.py +++ b/dask_geopandas/hilbert_distance.py @@ -43,7 +43,7 @@ def _hilbert_distance(gdf, total_bounds=None, level=16): # Compute distance along hilbert curve distances = _encode(level, x, y) - return pd.Series(distances, index=gdf.index, name="hilbert_distance") + return pd.Series(distances, index=gdf.index, name="hilbert_distance", dtype=np.int64) def _continuous_to_discrete_coords(bounds, level, total_bounds): From b4ec04f4c4cf6813eccaed28b6a1690ec4a1e41d Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 15:15:12 +0100 Subject: [PATCH 2/6] int64-hilbert-distance add test --- .../tests/test_spatial_partitioning.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/dask_geopandas/tests/test_spatial_partitioning.py b/dask_geopandas/tests/test_spatial_partitioning.py index 7c7d8a34..7f69689c 100644 --- a/dask_geopandas/tests/test_spatial_partitioning.py +++ b/dask_geopandas/tests/test_spatial_partitioning.py @@ -1,9 +1,12 @@ +import numpy as np import pytest import geopandas from geopandas.testing import assert_geodataframe_equal, assert_geoseries_equal +from shapely.geometry import Point import dask_geopandas +from dask_geopandas.hilbert_distance import _hilbert_distance def test_propagate_on_geometry_access(): @@ -60,3 +63,18 @@ def test_cx(): assert len(subset) == 0 expected = df.cx[-200:-190, 300:400] assert_geodataframe_equal(subset.compute(), expected) + + +def test_geopandas_handles_large_hilbert_distances(): + df = geopandas.GeoDataFrame( + { + 'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)] + } + ) + + # make sure we have values greater than 32bits + dist = _hilbert_distance(df) + assert ((dist > np.iinfo(np.int32).max) | (dist < np.iinfo(np.int32).min)).any() + + ddf = dask_geopandas.from_geopandas(df, npartitions=1) + ddf.spatial_shuffle() From 39d848b32b266ec672b452a0171c06fcab7d1322 Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 15:28:57 +0100 Subject: [PATCH 3/6] int64-hilbert-distance fix formatting --- dask_geopandas/tests/test_spatial_partitioning.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_geopandas/tests/test_spatial_partitioning.py b/dask_geopandas/tests/test_spatial_partitioning.py index 7f69689c..ea7176a4 100644 --- a/dask_geopandas/tests/test_spatial_partitioning.py +++ b/dask_geopandas/tests/test_spatial_partitioning.py @@ -66,11 +66,7 @@ def test_cx(): def test_geopandas_handles_large_hilbert_distances(): - df = geopandas.GeoDataFrame( - { - 'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)] - } - ) + df = geopandas.GeoDataFrame({'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)]}) # make sure we have values greater than 32bits dist = _hilbert_distance(df) From 7d7baec37ba3e62a39251c0bfb0909f0e9c8fcf0 Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 15:30:53 +0100 Subject: [PATCH 4/6] int64-hilbert-distance fix formatting --- dask_geopandas/tests/test_spatial_partitioning.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_geopandas/tests/test_spatial_partitioning.py b/dask_geopandas/tests/test_spatial_partitioning.py index ea7176a4..f1b7c18e 100644 --- a/dask_geopandas/tests/test_spatial_partitioning.py +++ b/dask_geopandas/tests/test_spatial_partitioning.py @@ -66,7 +66,9 @@ def test_cx(): def test_geopandas_handles_large_hilbert_distances(): - df = geopandas.GeoDataFrame({'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)]}) + df = geopandas.GeoDataFrame( + {'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)]} + ) # make sure we have values greater than 32bits dist = _hilbert_distance(df) From 6ae2f5e64245f554bf3e21282c37b66533c6628c Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 15:31:40 +0100 Subject: [PATCH 5/6] int64-hilbert-distance fix formatting --- dask_geopandas/tests/test_spatial_partitioning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_geopandas/tests/test_spatial_partitioning.py b/dask_geopandas/tests/test_spatial_partitioning.py index f1b7c18e..f33ec813 100644 --- a/dask_geopandas/tests/test_spatial_partitioning.py +++ b/dask_geopandas/tests/test_spatial_partitioning.py @@ -67,7 +67,7 @@ def test_cx(): def test_geopandas_handles_large_hilbert_distances(): df = geopandas.GeoDataFrame( - {'geometry': [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)]} + {"geometry": [Point(-103152.516, -8942.156), Point(118914.500, 1010032.562)]} ) # make sure we have values greater than 32bits From 74183dd12764930c4a9690e77339e7885e0fa5b4 Mon Sep 17 00:00:00 2001 From: Bernard Pazio Date: Mon, 3 Oct 2022 15:32:15 +0100 Subject: [PATCH 6/6] int64-hilbert-distance fix formatting --- dask_geopandas/hilbert_distance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dask_geopandas/hilbert_distance.py b/dask_geopandas/hilbert_distance.py index 0854e163..8bb597ea 100644 --- a/dask_geopandas/hilbert_distance.py +++ b/dask_geopandas/hilbert_distance.py @@ -43,7 +43,9 @@ def _hilbert_distance(gdf, total_bounds=None, level=16): # Compute distance along hilbert curve distances = _encode(level, x, y) - return pd.Series(distances, index=gdf.index, name="hilbert_distance", dtype=np.int64) + return pd.Series( + distances, index=gdf.index, name="hilbert_distance", dtype=np.int64 + ) def _continuous_to_discrete_coords(bounds, level, total_bounds):