Merge pull request #320 from martinfleis/simpson

REF: simplify calculation of Simpson diversity
pysal · Dec 23, 2021 · 90ce789 · 90ce789
2 parents 8d341e5 + 1b6a816
commit 90ce789
Show file tree

Hide file tree

Showing 2 changed files with 12 additions and 30 deletions.
diff --git a/momepy/diversity.py b/momepy/diversity.py
@@ -247,8 +247,6 @@ class Simpson:
         return Inverse Simpson index instead of Simpson index (``1 / λ``)
     categorical : bool (default False)
         treat values as categories (will not use ``binning``)
-    categories : list-like (default None)
-        list of categories. If None ``values.unique()`` is used.
     verbose : bool (default True)
         if True, shows progress bars in loops and indication of steps
     **classification_kwds : dict
@@ -315,7 +313,6 @@ def __init__(
         self.gini_simpson = gini_simpson
         self.inverse = inverse
         self.categorical = categorical
-        self.categories = categories
         self.classification_kwds = classification_kwds
 
         data = gdf.copy()
@@ -327,13 +324,10 @@ def __init__(
 
         data = data.set_index(unique_id)[values]
 
-        if not categories:
-            categories = data.unique()
-
         if not categorical:
             self.bins = classify(data, scheme=binning, **classification_kwds).bins
         else:
-            self.bins = categories
+            self.bins = None
 
         results_list = []
         for index in tqdm(data.index, total=data.shape[0], disable=not verbose):
@@ -347,7 +341,6 @@ def __init__(
                         values_list,
                         self.bins,
                         categorical=categorical,
-                        categories=categories,
                     )
                 )
             else:
@@ -361,7 +354,7 @@ def __init__(
             self.series = pd.Series(results_list, index=gdf.index)
 
 
-def simpson_diversity(data, bins=None, categorical=False, categories=None):
+def simpson_diversity(values, bins=None, categorical=False):
     """
     Calculates the Simpson\'s diversity index of data. Helper function for
     :py:class:`momepy.Simpson`.
@@ -370,18 +363,16 @@ def simpson_diversity(data, bins=None, categorical=False, categories=None):
 
         \\lambda=\\sum_{i=1}^{R} p_{i}^{2}
 
-    Formula adapted from https://gist.github.com/martinjc/f227b447791df8c90568.
 
     Parameters
     ----------
-    data : GeoDataFrame
-        GeoDataFrame containing morphological tessellation
+    values : pandas.Series
+        Series of values for which the index is calculated
     bins : array, optional
-        array of top edges of classification bins. Result of binnng.bins.
+        array of top edges of classification bins.
+        Should be equal to the result of binning.bins.
     categorical : bool (default False)
         treat values as categories (will not use ``bins``)
-    categories : list-like (default None)
-        list of categories
 
     Returns
     -------
@@ -398,24 +389,16 @@ def simpson_diversity(data, bins=None, categorical=False, categories=None):
         except ImportError:
             raise ImportError("The 'mapclassify' package is required")
 
-    def p(n, N):
-        """Relative abundance"""
-        if n == 0:
-            return 0
-        return float(n) / N
-
     if categorical:
-        counts = data.value_counts().to_dict()
-        for c in categories:
-            if c not in counts.keys():
-                counts[c] = 0
+        counts = values.value_counts()
+
     else:
-        sample_bins = mc.UserDefined(data, bins)
-        counts = dict(zip(bins, sample_bins.counts))
+        sample_bins = mc.UserDefined(values, bins)
+        counts = sample_bins.counts
 
-    N = sum(counts.values())
+    N = sum(counts)
 
-    return sum(p(n, N) ** 2 for n in counts.values() if n != 0)
+    return sum((n / N) ** 2 for n in counts if n != 0)
 
 
 class Gini:

diff --git a/tests/test_diversity.py b/tests/test_diversity.py
@@ -99,7 +99,6 @@ def test_Simpson(self):
             self.sw,
             "uID",
             categorical=True,
-            categories=range(15),
         ).series
         assert cat2[0] == pytest.approx(0.15)