diff --git a/ci/310-numba-oldest.yaml b/ci/310-numba-oldest.yaml index 6850f71d..e3c93e47 100644 --- a/ci/310-numba-oldest.yaml +++ b/ci/310-numba-oldest.yaml @@ -22,3 +22,4 @@ dependencies: - pytest - pytest-cov - pytest-xdist + - geodatasets diff --git a/ci/310-oldest.yaml b/ci/310-oldest.yaml index 66b5b088..f1420ea8 100644 --- a/ci/310-oldest.yaml +++ b/ci/310-oldest.yaml @@ -21,3 +21,4 @@ dependencies: - pytest - pytest-cov - pytest-xdist + - geodatasets diff --git a/ci/311-latest.yaml b/ci/311-latest.yaml index 7888d150..f8b49a52 100644 --- a/ci/311-latest.yaml +++ b/ci/311-latest.yaml @@ -20,3 +20,4 @@ dependencies: - pytest-xdist # optional - rtree + - geodatasets diff --git a/ci/311-numba-latest.yaml b/ci/311-numba-latest.yaml index 88685973..9692d717 100644 --- a/ci/311-numba-latest.yaml +++ b/ci/311-numba-latest.yaml @@ -19,6 +19,7 @@ dependencies: - pytest - pytest-cov - pytest-xdist + - geodatasets # optional - numba - rtree diff --git a/ci/312-dev.yaml b/ci/312-dev.yaml index 055015d3..ad1c8143 100644 --- a/ci/312-dev.yaml +++ b/ci/312-dev.yaml @@ -8,6 +8,7 @@ dependencies: # optional - rtree # testing + - geodatasets - codecov - folium - mapclassify diff --git a/ci/312-latest.yaml b/ci/312-latest.yaml index d3df3de2..d197fed9 100644 --- a/ci/312-latest.yaml +++ b/ci/312-latest.yaml @@ -12,6 +12,7 @@ dependencies: - scipy - shapely # testing + - geodatasets - codecov - folium - mapclassify diff --git a/ci/312-min.yaml b/ci/312-min.yaml index 0c0451f0..921899bb 100644 --- a/ci/312-min.yaml +++ b/ci/312-min.yaml @@ -4,6 +4,7 @@ channels: dependencies: - python=3.12 # required + - geodatasets - geopandas - libpysal>=4.12 - numpy diff --git a/ci/312-numba-dev.yaml b/ci/312-numba-dev.yaml index 1b425531..2f3af33f 100644 --- a/ci/312-numba-dev.yaml +++ b/ci/312-numba-dev.yaml @@ -9,6 +9,7 @@ dependencies: - numba - rtree # testing + - geodatasets - codecov - folium - mapclassify diff --git a/ci/312-numba-latest.yaml b/ci/312-numba-latest.yaml index 00e82525..7fd99d74 100644 --- a/ci/312-numba-latest.yaml +++ b/ci/312-numba-latest.yaml @@ -12,6 +12,7 @@ dependencies: - scipy - shapely # testing + - geodatasets - codecov - folium - mapclassify diff --git a/docs/_static/references.bib b/docs/_static/references.bib index ec783b45..dd413bf4 100644 --- a/docs/_static/references.bib +++ b/docs/_static/references.bib @@ -293,4 +293,14 @@ @article{ab_gl_vm2020joue issn = {0094-1190}, doi = {10.1016/j.jue.2019.103217}, author={Arribas-Bel, Daniel and Garcia-L{\'o}pez, M-{\`A} and Viladecans-Marsal, Elisabet}, - } \ No newline at end of file +} + +@article{wolf2024confounded, + title={{Confounded Local Inference:} Extending Local Moran Statistics to Handle Confounding}, + author={Wolf, Levi John}, + year={2024}, + number={in press}, + volume={in press}, + pages={0-0}, + journal={Annals of the American Association of Geographers} +} diff --git a/esda/moran_local_mv.py b/esda/moran_local_mv.py new file mode 100644 index 00000000..b3b09da5 --- /dev/null +++ b/esda/moran_local_mv.py @@ -0,0 +1,434 @@ +import numpy as np +import esda +from libpysal.weights import lag_spatial +from libpysal.graph import Graph + +try: + from tqdm import tqdm +except ImportError: + + def tqdm(x, **kwargs): + return x + + +def _calc_quad(x,y): + """ + This is a simpler solution to calculate a cartesian quadrant. + + To explain graphically, let the tuple below be (off_sign[i], neg_y[i]*2). + + If sign(x[i]) != sign(y[i]), we are on the negative diagonal. + If y is negative, we are on the bottom of the plot. 
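+    For example, x[i] = -1 and y[i] = 2 gives off_sign[i] = 1 and neg_y[i] = 0,
+    so the sum below evaluates to 1 + 0 + 1 = 2, i.e. quadrant II.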
+ + Therefore, the sum (off_sign + neg_y*2 + 1) gives you the cartesian quadrant. + + II | I + 1,0 | 0,0 + -----+----- + 0,2 | 1,2 + III | IV + + """ + off_sign = np.sign(x) != np.sign(y) + neg_y = (y<0) + return off_sign + neg_y*2 + 1 + +class Partial_Moran_Local(object): + def __init__( + self, permutations=999, unit_scale=True, partial_labels=True + ): + """ + Compute the Multivariable Local Moran statistics under partial dependence, as defined by :cite:`wolf2024confounded` + + Arguments + --------- + permutations : int + the number of permutations to run for the inference, + driven by conditional randomization. + unit_scale : bool + whether to enforce unit variance in the local statistics. This + normalizes the variance of the data at inupt, ensuring that + the covariance statistics are not overwhelmed by any single + covariate's large variance. + partial_labels : bool + whether to calculate the classification based on the part-regressive + quadrant classification or the univariate quadrant classification, + like a classical Moran's I. When mvquads is True, the variables are labelled as: + - label 1: observations with large y - rho * x that also have large Wy values. + - label 2: observations with small y - rho * x values that also have large Wy values. + - label 3: observations with small y - rho * x values that also have small Wy values. + - label 4: observations with large y - rho * x values that have small Wy values. + Defaults to part-regressive quadrants + + Attributes + ---------- + connectivity : The weights matrix inputted, but row standardized + D : The "design" matrix used in computation. If X is + not None, this will be [1 y X] + R : the "response" matrix used in computation. Will + always be the same shape as D and contain [1, Wy, Wy, ....] + DtDi : empirical parameter covariance matrix + the P x P matrix describing the variance and covariance + of y and X. + P : the number of parameters. 1 if X is not provided. + lmos_ : the N,P matrix of multivariable LISA statistics. + the first column, lmos[:,1] is the LISAs corresponding + to the relationship between Wy and y conditioning on X. + rlmos_ : the (N, permutations, P+1) realizations from the conditional + randomization to generate reference distributions for + each Local Moran statistic. rlmos_[:,:,1] pertain to + the reference distribution of y and Wy. + quads_ : the (N, P) matrix of quadrant classifications for the + part-regressive relationships. quads[:,0] pertains to + the relationship between y and Wy. The mean is not classified, + since it's just binary above/below mean usually. + partials_: the (N,2,P+1) matrix of part-regressive contributions. + The ith slice of partials_[:,:,i] contains the + partial regressive contribution of that covariate, with + the first column indicating the part-regressive outcome + and the second indicating the part-regressive design. + The partial regression matrix starts at zero, so + partials_[:,:,0] corresponds to the partial regression + describing the relationship between y and Wy. + """ + self.permutations = permutations + self.unit_scale = unit_scale + self.partial_labels = partial_labels + + def fit(self, X, y, W): + """ + Fit the partial local Moran statistic on input data + + Parameters + ---------- + X : (N,p) array + array of data that is used as "confounding factors" + to account for their covariance with Y. + y : (N,1) array + array of data that is the targeted "outcome" covariate + to compute the multivariable Moran's I + W : (N,N) weights object + a PySAL weights object. 
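+            Either a libpysal.weights.W or a libpysal.graph.Graph is accepted.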
Immediately row-standardized. + + Returns + ------- + self : object + this Partial_Moran_Local() statistic after fitting to data + """ + y = np.asarray(y).reshape(-1, 1) + if isinstance(W, Graph): + W = W.transform("R") + else: + W.transform = "r" + y = y - y.mean() + if self.unit_scale: + y /= y.std() + X = X - X.mean(axis=0) + if self.unit_scale: + X = X / X.std(axis=0) + self.y = y + self.X = X + D, R = self._make_data(y, X, W) + self.D, self.R = D, R + self.P = D.shape[1] - 1 + self.N = W.n + self.DtDi = np.linalg.inv( + self.D.T @ self.D + ) # this is only PxP, so not too bad... + self._left_component_ = (self.D @ self.DtDi) * (self.N - 1) + self._lmos_ = self._left_component_ * self.R + self.connectivity = W + self.permutations = self.permutations + if self.permutations is not None: # NOQA necessary to avoid None > 0 + if self.permutations > 0: + self._crand(y, X, W) + + + self._rlmos_ *= self.N - 1 + self._p_sim_ = np.zeros((self.N, self.P + 1)) + # TODO: this should be changed to the general p-value framework + for permutation in range(self.permutations): + self._p_sim_ += ( + self._rlmos_[:, permutation, :] < self._lmos_ + ).astype(int) + self._p_sim_ /= self.permutations + self._p_sim_ = np.minimum(self._p_sim_, 1 - self._p_sim_) + + component_quads = [] + for i, left in enumerate(self._left_component_.T): + right = self.R[:, i] + quads = _calc_quad(left - left.mean(), right) + component_quads.append(quads) + self._partials_ = np.asarray( + [ + np.vstack((left, right)).T + for left, right in zip(self._left_component_.T, self.R.T) + ] + ) + + uvquads = [] + negative_lag = R[:,1] < 0 + for i, x_ in enumerate(self.D.T): + if i == 0: + continue + off_sign = np.sign(x_) != np.sign(R[:,1]) + quads = negative_lag.astype(int).flatten() * 2 + off_sign.astype(int) + 1 + uvquads.append(quads.flatten()) + + self._uvquads_ = np.row_stack(uvquads).T + self._mvquads_ = np.row_stack(component_quads).T + return self + + def _make_data(self, z, X, W): + if isinstance(W, Graph): # NOQA because ternary is confusing + Wz = W.lag(z) + else: + Wz = lag_spatial(W, z) + if X is not None: + D = np.hstack((np.ones(z.shape), z, X)) + P = X.shape[1] + 1 + else: + D = np.hstack((np.ones(z.shape), z)) + P = 1 + R = np.tile(Wz, P + 1) + return D, R + # self.D, self.R = D, R + + def _crand(self, y, X, W): + N = W.n + N_permutations = self.permutations + prange = range(N_permutations) + if isinstance(W, Graph): + max_neighbs = W.cardinalities.max() + 1 + else: + max_neighbs = W.max_neighbors + 1 + pre_permutations = np.array( + [np.random.permutation(N - 1)[0:max_neighbs] for i in prange] + ) + straight_ids = np.arange(N) + if isinstance(W, Graph): # NOQA + id_order = W.unique_ids + else: + id_order = W.id_order + DtDi = self.DtDi + ordered_weights = [W.weights[id_order[i]] for i in straight_ids] + ordered_cardinalities = [W.cardinalities[id_order[i]] for i in straight_ids] + lmos = np.empty((N, N_permutations, self.P + 1)) + for i in tqdm(range(N), desc="Simulating by site"): + ids_noti = straight_ids[straight_ids != i] + np.random.shuffle(ids_noti) + these_permutations = pre_permutations[:, 0 : ordered_cardinalities[i]] + randomized_permutations = ids_noti[these_permutations] + shuffled_ys = y[randomized_permutations] + these_weights = np.asarray(ordered_weights[i]).reshape(-1, 1) + shuffled_Wyi = (shuffled_ys * these_weights).sum( + axis=1 + ) # these are N-permutations by 1 now + # shuffled_X = X[randomized_permutations, :] + # #these are still N-permutations, N-neighbs, N-covariates + if X is None: + 
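+                # univariate case: the local design row is just (1, y_i)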
local_data = np.array((1, y[i].item())).reshape(1, -1) + shuffled_D = np.tile( + local_data, N_permutations + ).T # now N permutations by P + else: + local_data = np.array((1, y[i].item(), *X[i])).reshape(-1, 1) + shuffled_D = np.tile( + local_data, N_permutations + ).T # now N permutations by P + shuffled_R = np.tile(shuffled_Wyi, self.P + 1) + lmos[i] = (shuffled_R * shuffled_D) @ DtDi + self._rlmos_ = lmos # nobs, nperm, nvars + + @property + def associations_(self): + """ + The association between y and the local average of y, + removing the correlation due to x and the local average of y + """ + return self._lmos_[:, 1] + + @property + def significances_(self): + """ + The pseudo-p-value built using map randomization for the + structural relationship between y and its local average, + removing the correlation due to the relationship between x + and the local average of y. + """ + return self._p_sim_[:, 1] + + @property + def partials_(self): + """ + The components of the local statistic. The first column is the + structural exogenous component of the data, and the second is the + local average of y. + """ + return self._partials_[1] + + @property + def reference_distribution_(self): + """ + Simulated distribution of associations_, assuming that there is + - no structural relationship between y and its local average; + - the same observed structural relationship between y and x. + """ + return self._rlmos_[:, :, 1] + + @property + def labels_(self): + """ + The classifications (in terms of cluster-type and outlier-type) + for the associations_ statistics. If the quads requested are + *mvquads*, then the classification is done with respect to the + left and right components (first and second columns of partials_). + + If the quads requested are *uvquads*, then this will only be computed + with respect to the outcome and the local average. + The cluster typology is: + - 1: above-average left component (either y or D @ DtDi), + above-average right component (local average of y) + - 2: below-average left component (either y or D @ DtDi), + above-average right component (local average of y) + - 3: below-average left component (either y or D @ DtDi) + below-average right component (local average of y) + - 4: above-average left component (either y or D @ DtDi) + below-average right component (local average of y) + """ + if self.partial_labels: + return self._mvquads_[:, 1] + else: + return self._uvquads_[:, 1] + + +class Auxiliary_Moran_Local(esda.Moran_Local): + """ + Fit a local moran statistic for y after regressing out the + effects of confounding X on y. A "stronger" version of the + Partial_Moran statistic, as defined by :cite:`wolf2024confounded` + """ + + def __init__( + self, + permutations=999, + unit_scale=True, + transformer=None, + ): + """ + Initialize a local Moran statistic on the regression residuals + + + permutations : int (default: 999) + the number of permutations to run for the inference, + driven by conditional randomization. + unit_scale : bool (default: True) + whether or not to convert the input data to a unit normal scale. + transformer : callable (default: scikit regression) + should transform X into a predicted y. If not provided, will use + the standard scikit OLS regression of y on X. 
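+
+        A minimal usage sketch (X, y, and W are as described in fit below):
+
+            stat = Auxiliary_Moran_Local(permutations=999).fit(X, y, W)
+            stat.associations_    # local statistics for y after regressing out X
+            stat.significances_   # pseudo p-values from conditional randomization
+            stat.labels_          # cluster/outlier quadrant labels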
+ """ + self.permutations = permutations + self.unit_scale = unit_scale + self.transformer = transformer + + def fit(self, X, y, W): + """ + Arguments + --------- + y : (N,1) array + array of data that is the targeted "outcome" covariate + to compute the multivariable Moran's I + X : (N,3) array + array of data that is used as "confounding factors" + to account for their covariance with Y. + W : (N,N) weights object + a PySAL weights object. Immediately row-standardized. + + Returns + ------- + A fitted Auxiliary_Moran_Local() estimator + """ + y = y - y.mean() + X = X - X.mean(axis=0) + if self.unit_scale: + y /= y.std() + X /= X.std(axis=0) + self.y = y + self.X = X + y_filtered_ = self.y_filtered_ = self._part_regress_transform(y, X) + if isinstance(W, Graph): + W = W.transform("R") + Wyf = W.lag(y_filtered_) + else: + W.transform = "r" + Wyf = lag_spatial(W, y_filtered_) # TODO: graph + self.connectivity = W + self.partials_ = np.column_stack((y_filtered_, Wyf)) + y_out = self.y_filtered_ + self.associations_ = ((y_out * Wyf) / (y_out.T @ y_out) * (W.n - 1)).flatten() + if self.permutations > 0: + self._crand() + # TODO: use the general p-value framework + p_sim = (self.reference_distribution_ < self.associations_[:, None]).mean( + axis=1 + ) + self.significances_ = np.minimum(p_sim, 1 - p_sim) + quads = np.array([[3,2,4,1]]).reshape(2,2) + left_component_cluster = (y_filtered_ > 0).astype(int) + right_component_cluster = (Wyf > 0).astype(int) + quads = quads[left_component_cluster, right_component_cluster] + self.labels_ = quads.squeeze() + return self + + def _part_regress_transform(self, y, X): + """If the object has a _transformer, use it; otherwise, fit it.""" + if hasattr(self, "_transformer"): + ypart = y - self._transformer(X) + else: + from sklearn.linear_model import LinearRegression + + self._transformer = LinearRegression().fit(X, y).predict + ypart = self._part_regress_transform(y, X) + return ypart + + def _crand(self): + """Cribbed from esda.Moran_Local + conditional randomization + for observation i with ni neighbors, the candidate set cannot include + i (we don't want i being a neighbor of i). we have to sample without + replacement from a set of ids that doesn't include i. numpy doesn't + directly support sampling wo replacement and it is expensive to + implement this. instead we omit i from the original ids, permute the + ids and take the first ni elements of the permuted ids as the + neighbors to i in each randomization. 
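+        The simulated statistics are finally rescaled by (n - 1) / sum(z ** 2)
+        so that they match the scaling of the observed associations_.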
+ """ + _, z = self.partials_.T + lisas = np.zeros((self.connectivity.n, self.permutations)) + n_1 = self.connectivity.n - 1 + prange = list(range(self.permutations)) + if isinstance(self.connectivity, Graph): + k = self.connectivity.cardinalities.max() + 1 + else: + k = self.connectivity.max_neighbors + 1 + nn = self.connectivity.n - 1 + rids = np.array([np.random.permutation(nn)[0:k] for i in prange]) + ids = np.arange(self.connectivity.n) + if hasattr(self.connectivity, "id_order"): + ido = self.connectivity.id_order + else: + ido = self.connectivity.unique_ids.values + w = [self.connectivity.weights[ido[i]] for i in ids] + wc = [self.connectivity.cardinalities[ido[i]] for i in ids] + + for i in tqdm(range(self.connectivity.n), desc="Simulating by site"): + idsi = ids[ids != i] + np.random.shuffle(idsi) + tmp = z[idsi[rids[:, 0 : wc[i]]]] + lisas[i] = z[i] * (w[i] * tmp).sum(1) + self.reference_distribution_ = (n_1 / (z * z).sum()) * lisas + + +Auxiliary_Moran_Local.__init__.__doc__ = Partial_Moran_Local.__init__.__doc__.replace( + "Partial", "Auxiliary" +) diff --git a/esda/tests/test_moran_local_mv.py b/esda/tests/test_moran_local_mv.py new file mode 100644 index 00000000..8ab130fe --- /dev/null +++ b/esda/tests/test_moran_local_mv.py @@ -0,0 +1,132 @@ +import numpy +import geodatasets +import geopandas +import pytest +from libpysal.weights import Queen +from libpysal.graph import Graph +from sklearn.linear_model import TheilSenRegressor +from esda.moran_local_mv import Partial_Moran_Local, Auxiliary_Moran_Local +from esda.moran import Moran_Local_BV + +def rsqueen(df): + w_classic = Queen.from_dataframe(df) + w_classic.transform = 'r' + return w_classic + +@pytest.fixture(scope='module') +def data(): + df = geopandas.read_file(geodatasets.get_path("geoda.lansing1")) + df = df[df.FIPS.str.match("2606500[01234]...") | (df.FIPS == "26065006500")] + y = df.HH_INC.values.reshape(-1,1) + X = df.HSG_VAL.values.reshape(-1,1) + yield y,X,df + +@pytest.fixture(scope='module', + params = [ + rsqueen, + lambda df: Graph.build_contiguity(df).transform('r') + ], + ids=['W', 'Graph'] + ) +def graph(data, request): + _,_,df = data + return request.param(df) + +def test_partial_runs(data, graph): + """Check if the class computes successfully in a default configuration""" + y,X,df = data + m = Partial_Moran_Local(permutations=1).fit(X,y,graph) + # done, just check if it runs + +def test_partial_accuracy(data, graph): + """Check if the class outputs expected results at a given seed""" + y,X,df = data + numpy.random.seed(111221) + m = Partial_Moran_Local(permutations=10).fit(X,y,graph) + # compute result by hand + zy = (y - y.mean())/y.std() + wz = (graph.sparse @ zy) + zx = (X - X.mean(axis=0))/X.std(axis=0) + rho = numpy.corrcoef(zy.squeeze(), zx.squeeze())[0,1] + left = zy - rho * zx + scale = (graph.n-1) / (graph.n * (1 - rho**2)) + # (y - rho x)*wy + manual = (left*wz).squeeze() * scale + # check values + numpy.testing.assert_allclose(manual, m.associations_) + + # check significances are about 18 + numpy.testing.assert_allclose((m.significances_ < .01).sum(), 18, atol=1) + numpy.testing.assert_equal((m.significances_[:5] < .1), [True, True, True, False, False]) + + # check quad + is_cluster = numpy.prod(m.partials_, axis=1) >= 0 + is_odd_label = m.labels_ % 2 + numpy.testing.assert_equal(is_cluster, is_odd_label) + +def test_partial_unscaled(data, graph): + """Check if the variance scaling behaves as expected""" + y,X,df = data + m = Partial_Moran_Local(permutations=0, 
unit_scale=True).fit(X,y,graph) + m2 = Partial_Moran_Local(permutations=0, unit_scale=False).fit(X,y,graph) + # variance in the partials_ should be different + s1y,s1x = m.partials_.std(axis=0) + s2y,s2x = m2.partials_.std(axis=0) + assert s1y > s2y, "variance is incorrectly scaled for y" + assert s1x < s2x, "variance is incorrectly scaled for x" + +def test_partial_uvquads(data, graph): + """Check that the quadrant decisions vary correctly with the inputs""" + y,X,df = data + m = Partial_Moran_Local(permutations=0, partial_labels=False).fit(X,y,graph) + bvx = Moran_Local_BV(X,y,graph,permutations=0) + numpy.testing.assert_array_equal(m.labels_, bvx.q) + +def test_aux_runs(data, graph): + """Check that the class completes successfully in a default configuration""" + y,X,df = data + a = Auxiliary_Moran_Local(permutations=1).fit(X,y,graph) + #done, just check if it runs + +def test_aux_accuracy(data, graph): + """Check that the class outputs expected values for a given seed""" + y,X,df = data + numpy.random.seed(111221) + a = Auxiliary_Moran_Local(permutations=10).fit(X,y,graph) + + # compute result by hand + zy = (y - y.mean())/y.std() + wz = (graph.sparse @ zy) + zx = (X - X.mean(axis=0))/X.std(axis=0) + wzx = graph.sparse @ zx + rho = numpy.corrcoef(zy.squeeze(), zx.squeeze())[0,1] + mean = zy * wz - rho * zx * wz - rho * zy * wzx + rho**2 * zx * wzx + scale = (graph.n-1) / (graph.n * (1 - rho**2)) + + manual = numpy.asarray(mean * scale).squeeze() + # check values, may not be identical because of the + # matrix inversion least squares estimator used in scikit + numpy.testing.assert_allclose(manual, a.associations_) + + # check significances + numpy.testing.assert_equal((a.significances_ < .01).sum(), 18) + numpy.testing.assert_equal((a.significances_[:5] < .1), [False, False, True, False, False]) + + is_cluster = numpy.prod(a.partials_, axis=1) >= 0 + is_odd_label = (a.labels_ % 2).astype(bool) + numpy.testing.assert_equal(is_cluster, is_odd_label) + +def test_aux_unscaled(data, graph): + """Check that the variance scaling behaves as expected""" + y,X,df = data + a = Auxiliary_Moran_Local(permutations=0, unit_scale=True).fit(X,y,graph) + a2 = Auxiliary_Moran_Local(permutations=0, unit_scale=False).fit(X,y,graph) + assert (a.partials_.std(axis=0) < a2.partials_.std(axis=0)).all(), ( + "variance is not scaled correctly in partial regression." + ) + +def test_aux_transformer(data, graph): + """Check that an alternative regressor can be used to calculate y|X""" + y,X, df = data + a = Auxiliary_Moran_Local(permutations=0, transformer=TheilSenRegressor).fit(X,y,graph) + # done, should just complete \ No newline at end of file diff --git a/notebooks/multivariable_moran.ipynb b/notebooks/multivariable_moran.ipynb new file mode 100644 index 00000000..620dc894 --- /dev/null +++ b/notebooks/multivariable_moran.ipynb @@ -0,0 +1,972 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Local Multi-Variable Moran Statistics\n", + "\n", + "Local Moran statistics are very useful to assess spatial clusters (or\n", + "outliers) in geographic data. Moran-style statistics are fundamentally\n", + "based on the *covariance* of an outcome $y_i$ with other observations\n", + "$y_j$, weighted according to some function that describes how near $i$\n", + "is to $j$. 
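In this notebook, for example, that function is simple contiguity: $w_{ij}$ is positive only when areas $i$ and $j$ are adjacent, and zero otherwise.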
Classically, we describe that weight as $w_{ij}$, and collect\n", + "that into a big matrix, $\\mathbf{W}$, that describes the relations\n", + "between each site and every other site.\n", + "\n", + "The following discussion is a significantly condensed presentation of\n", + "that found in [Wolf\n", + "(2024)](https://doi.org/10.1080/24694452.2024.2326541) pages 1217-1225.\n", + "\n", + "The classical Moran statistic is stated as the relationship between $y$\n", + "and its surroudnings. I’m assuming that $y$ is unit standardized, so\n", + "that it has a mean of zero and a standard deviation of 1. Further, I’m\n", + "assuming that our weights matrix is row-standardized with a diagonal of\n", + "zero, meaning that $\\sum_j w_{ij} =1$ and $w_{ii}=0$. Further, this\n", + "means that $\\sum_j w_{ij}y_j$ corresponds to the *weighted average*\n", + "$y_j$ around $i$. With this understanding, the global $\\hat{I}$\n", + "estimator is often stated as:\n", + "\n", + "$$ \\hat{I} = \\frac{1}{n} \\sum_i y_i \\sum_j w_{ij}y_j $$\n", + "\n", + "You can understand this also as a kind of least squares estimator,\n", + "arising from the following regression:\n", + "\n", + "$$ \\mathbf{W}y = Iy + \\epsilon \\ \\ \\ \\ \\ \\epsilon \\sim \\mathcal{N}(0,\\sigma^2)$$\n", + "\n", + "In this framing, we can think of the $I$ statistic in vector form as:\n", + "\n", + "$$ \\hat{I} = (y'y)^{-1}y'\\mathbf{W}y $$\n", + "\n", + "With the standardization we’ve used above. The *local* version of this\n", + "statistic simply “stops” the outer summation over $i$:\n", + "\n", + "$$ \\hat{I}_i = \\frac{1}{n} y_i \\sum_j w_{ij} y_j $$\n", + "\n", + "This is equivalent to “stopping” the inner product between $y$ and\n", + "$\\mathbf{W}y$ in the vector form, turning it into an element-wise\n", + "product (spelled $\\circ$ in math below):\n", + "\n", + "$$ \\hat{I}_i = (y'y)^{-1}(y \\circ \\mathbf{W}y)$$\n", + "\n", + "This “incomplete summation” or “inner-to-elementwise trick” is how most\n", + "of the local statistics are obtained from a global measure of\n", + "covariance-based association. Any time you take an operation relating\n", + "all pairs of observations, and sum that (or average that) over all the\n", + "observations, you can create a “local” measure by just stopping that\n", + "summation.\n", + "\n", + "## How do we introduce another variables?\n", + "\n", + "Often, it’s useful to account for the spatial co-variation between two\n", + "variables. Indeed, a common question is whether the *spatial pattern* of\n", + "$y$ is similar to that of a second variable, $x$. Past attempts to link\n", + "two variables like this include the Wartenberg statistic:\n", + "\n", + "$$ \\hat{I}_{xy,i} = x_i \\sum_j w_{ij}y_j $$\n", + "$$ \\hat{\\mathbf{I}}_{xy} = x \\circ \\mathbf{W}y $$\n", + "\n", + "which relates $x_i$ to the local average of $y_j$ nearby. This is\n", + "useful, because it tells you whether a smoothed surface of $y$ looks\n", + "like the surface of $x$. Perhaps less useful is the Lee (2001)\n", + "innovation on the statistic, which seeks to compare the two smoothed\n", + "patterns:\n", + "\n", + "$$ \\hat{L}_i = (\\sum_j w_{ij}x_j) * (\\sum_j w_{ij}y_j) $$\n", + "$$ \\hat{\\mathbf{L}} = (\\mathbf{W}x) \\circ (\\mathbf{W}y) $$\n", + "\n", + "Both of these statistics are useful in their own ways, but generally are\n", + "not able to separate out $x$’s influence on $y$ from $y$’s internal\n", + "patterning over $\\mathbf{W}$. 
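(The Wartenberg form is implemented in `esda` as `Moran_Local_BV`.)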
Instead, we’re stuck making pairwise\n", + "comparisons across $x$, $y$, and $\\mathbf{W}$.\n", + "\n", + "## How can we introduce another *exogenous* variable?\n", + "\n", + "Indeed, it’s often the case that $x$ represents some other factor we\n", + "know is associated with $y$, but cannot wholly explain $y$. When we just\n", + "need a global statistic characterizing this relationship, the best\n", + "solution is to use a spatial model, such as the spatial lag or spatial\n", + "error models in `spreg`. Creating a local statistic from these models is\n", + "difficult, however, since the non-linear matrix product cannot be cast\n", + "to an element-wise one. While we can examine the direct and indirect\n", + "components of $\\beta$, which measure the effect that a change in $x$ at\n", + "a given site has on *surrounding $y$*, this is not itself a measure of\n", + "the relationship within $y$.\n", + "\n", + "Instead, what we often want is to “remove” or “control” for $x$, and see\n", + "what the pattern is in this adjusted $y$. This aim is similar to that\n", + "found in work on [Restricted Spatial\n", + "Regression](https://www.tandfonline.com/doi/full/10.1080/01621459.2020.1788949),\n", + "but proceeds very differently (and thus avoids its counter-intuitive\n", + "effects). To illustrate, one of the common ways that we “remove” $x$\n", + "from $y$ is to simply regress $x$ out of $y$, and analyze its residuals.\n", + "That is, we do the first regression:\n", + "\n", + "$$ y = x\\beta + \\epsilon $$\n", + "\n", + "using a standard least squares estimator:\n", + "\n", + "$$ \\hat{\\beta} = (x'x)^{-1}x'y $$\n", + "\n", + "And use that to build a *residual* $y$, having removed its association\n", + "with $x$:\n", + "\n", + "$$ e = y - x (x'x)^{-1} x' y $$\n", + "\n", + "and then use *this* in a Moran-form regression:\n", + "\n", + "$$ \\mathbf{W}e = I_{x\\rightarrow y}e + \\nu $$\n", + "\n", + "Here, the search for structure in $e$ has already assumed that variation\n", + "in $y$ is fully “caused by” $x$ *first*. That is, the spatial structure\n", + "of $y$ that matters is that which is fully independent of $x$’s pattern.\n", + "Note that $e = (I-P)y$ in the RSR framing.\n", + "\n", + "If we estimate this again usting the same inner-to-elementwise trick, we\n", + "can obtain:\n", + "\n", + "$$ I_{x\\rightarrow y} = e \\circ \\mathbf{W}e (e'e)^{-1}$$\n", + "\n", + "Assuming that there is only one varible $x$, this can be re-stated in\n", + "terms of $y$ and $x$ with a little algebra:\n", + "\n", + "$$ I_{x\\rightarrow y} = \\left[ y\\circ \\mathbf{W}y - \\rho_{xy} x\\circ \\mathbf{W}y - \\rho_{xy} y\\circ\\mathbf{W}x + \\rho^2 x\\circ\\mathbf{W}x \\right] \\frac{n-1}{n} \\frac{1}{1-\\rho_{xy}^2}$$\n", + "\n", + "In this expression, you can see the Moran’s $I$ all the way on the left\n", + "of the bracketed expression, with two Wartenberg estimators in the\n", + "middle, and another Moran’s $I$ for $x$ on the end. Each term is also\n", + "scaled by $\\rho_{xy}$, depending on the directness of the relationship\n", + "with $y$. You can think of this as $I$, rescaled by a measure of\n", + "*indirect covariation*. Graphically, this is represented in the right\n", + "facet of the following image:\n", + "\n", + "
\n", + "\n", + "
Graph denoting the relations between some\n", + "variable x and the outcome of\n", + "interest y whose spatial\n", + "structure is of interest.
\n", + "
\n", + "\n", + "Since we’ve assumed that all of the variation in $y$ “comes from” $x$\n", + "first, we must consider all of the paths through *any* $x$ into $y$. You\n", + "can see that the correction terms that “flow through” $x$ must be\n", + "corrected by $\\rho_{xy}$ each time. So, for the $I_{x \\rightarrow y}$ on\n", + "the far right of the diagram, we have four possible paths between $y_1$\n", + "and $\\mathbf{W}y_1$:\n", + "\n", + "1. The most direct path is from $y_1$ to $\\mathbf{W}y_1$,\n", + "2. Another indirect path goes from $y_1$ through $x_i$, then to\n", + " $\\mathbf{W}y_1$.\n", + "3. Another indirect path goes from $y_i$, through $x$, then\n", + " $\\mathbf{W}x_1$, and then to $\\mathbf{W}y_1$.\n", + "\n", + "Hence, terms 2 and 3 are represented by the green path and the\n", + "red+orange path, while term 4 corresponds to the red+maroon path. Terms\n", + "2 and 3 are Wartenberg-style bivariate Moran statistics corrected by\n", + "$\\rho_{xy}$, while term 4 is a univariate Moran statistic for $x$,\n", + "corrected by $\\rho_{xy}^2$.\n", + "\n", + "But, what if we don’t want to assume that *all of* the variation in $y$\n", + "should be assigned to $x$ first? Then, we can use a Partial Moran\n", + "statistic, which examines the “direct” path from $y$ to $\\mathbf{W}y$\n", + "and corrects it by the portion of variation due to $x$’s association\n", + "with $y$, which drives a bit of the association between $y$ and\n", + "$\\mathbf{W}y$:\n", + "\n", + "$$ \\mathbf{\\hat{I}}_{y|x} = \\left[y\\circ \\mathbf{W}y - \\rho_{xy} x\\circ\\mathbf{W}y \\right] \\frac{N-1}{N} \\frac{1}{1 - \\rho^2_{xy}}$$\n", + "\n", + "So, these both correspond to very basic “corrections” of the univariate\n", + "$y$ using simple correlations, univariate Moran’s $I$, or bivariate\n", + "Moran’s $I$. However, the method can be extended to more covariates, and\n", + "its interpretation is easy to justify using some simple econometric\n", + "theory (see the end of this notebook).\n", + "\n", + "# Code time\n", + "\n", + "We’ll start with a few core packages. First, these new statistics are\n", + "implemented in the `moran_local_mv` module of the `esda` package. This\n", + "stands for “moran local multivariate” statistic. In addition, we’ll need\n", + "to compare these stats to the standard Univariate Moran (`Moran_Local`)\n", + "and bivariate/Wartenberg Moran (`Moran_Local_BV`) again from the `esda`\n", + "package. We’ll also want an easy way to calculate the correlation using\n", + "`scipy.stats.pearsonr`, a way to read in our data (`geopandas`,\n", + "`geodatasets`), a way to store the spatial relationships between\n", + "observations, provided by the `libpysal.graph` module, and a\n", + "visualization toolkit, `matplotlib.pyplot`." 
+ ], + "id": "0c7d9a8f-9d6e-4a00-8af1-f4d8846145b9" + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from esda.moran_local_mv import (\n", + " Partial_Moran_Local, Auxiliary_Moran_Local # new Moran\n", + ")\n", + "from esda.moran import Moran_Local, Moran_Local_BV # standard Moran\n", + "from libpysal import graph # construct spatial relations\n", + "from scipy.stats import pearsonr # calculate correlation\n", + "import geopandas, geodatasets # work with spatial data\n", + "from matplotlib import pyplot as plt # visualize\n", + "import seaborn # further visualization tools\n", + "import numpy # math tools" + ], + "id": "54111996" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We’ll work with Airbnb data from Chicago, in order to understand how\n", + "additional information $x$ informs our judgement about outcomes $y$. In\n", + "this case, we’ll consider the average price per head per night for\n", + "airbnbs in neighborhoods of Chicago, and how it is informed by the\n", + "overcrowding rate in those neighborhoods, which is a function itself of\n", + "the population density and investment in housing stock in the\n", + "neighborhood." + ], + "id": "b290e3d4-691a-4458-9362-e73ca829f90f" + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = geopandas.read_file(\n", + " geodatasets.get_path(\"geoda.airbnb\")\n", + ").dropna()" + ], + "id": "610fac5e" + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "output_type": "display_data", + "metadata": {}, + "data": { + "text/html": [ + "
Make this Notebook Trusted to load map: File -> Trust Notebook
" + ] + } + } + ], + "source": [ + "df.explore(\"price_pp\")" + ], + "id": "e382cbd7" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Im order to estimate the statistics, we need get our data. We’ll use the\n", + "`price_pp` to get the price per person per night for Airbnbs in each\n", + "neighborhood. And, we’ll use the `crowded` column for the percentages of\n", + "properties that are over-crowded. We’ll build a simple contiguity\n", + "weights matrix to explain the spatial adjacencies between neighborhoods." + ], + "id": "38775242-4e52-44ba-ab44-a6763cc614a7" + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "y = df.price_pp.values.reshape(-1,1)\n", + "x = df.crowded.values.reshape(-1,1)\n", + "w = graph.Graph.build_contiguity(df).transform(\"R\")" + ], + "id": "96cdd033" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, with these defined, we can calculate all of our statistics. First,\n", + "we can note that each statistic corresponds to a different question of\n", + "relation, like we discussed above:" + ], + "id": "ad94cc3a-fad3-4b82-95fa-9066d56e74dc" + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "Simulating by site: 0%| | 0/66 [00:00" + ] + } + } + ], + "source": [], + "id": "c0ea28ef" + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "jupyter": { + "source_hidden": true + } + }, + "outputs": [ + { + "output_type": "display_data", + "metadata": {}, + "data": { + "text/html": [ + "\n", + "" + ] + } + } + ], + "source": [], + "id": "10a261ad" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From these, you can immediately see that the partial statistics can only\n", + "move laterally. No observations moved from coldspot to anything other\n", + "than “peak”. That is, $x$ only affects our judgement about $y$, not\n", + "$\\mathbf{W}y$. In the bottom, however, you can see that moves exist\n", + "across any quadrant, although no observations jump from being hotspots\n", + "to coldspots.\n", + "\n", + "# Conclusion\n", + "\n", + "In sum, the `Partial_Moran_Local` and `Auxiliary_Moran_Local` are great\n", + "ways to control for “exogenous” variables $x$ that you think might be\n", + "driving the pattern of your outcome. In this case, if you want to remove\n", + "*everything* due to $x$ first, you should use the\n", + "`Auxiliary_Moran_Local`, which is equivalent to regressing $x$ out of\n", + "$y$ and analyzing the residuals for structure. However, it is often the\n", + "case that we want to account for $x$ *without* assigning all of the\n", + "variation in $y$ to $x$ alone. 
In this case, use the\n", + "`Partial_Moran_Local` to recover an estimate of the relationship between\n", + "a site’s $y$ value and its surrounding $y$ values while fixing $x$ to a\n", + "constant value.\n", + "\n", + "# A bit more formal aside: Frisch-Waugh-Lovell to the rescue\n", + "\n", + "For the econometricians in the room, there is an even better grounding\n", + "than the graph-based argument from above to use the partial coefficient.\n", + "If $\mathbf{X}$ is a collection of exogenous control matrices, we\n", + "generally want to estimate $\rho$ and its partial products/local effects\n", + "from the following Moran-form regression:\n", + "\n", + "$$ \mathbf{W}y = \mathbf{X}\beta + \rho y + \epsilon $$\n", + "\n", + "How can we “get rid of” $\mathbf{X}\beta$ in our estimator for $\rho$?\n", + "Well, the\n", + "[Frisch-Waugh-Lovell](https://en.wikipedia.org/wiki/Frisch–Waugh–Lovell_theorem)\n", + "theorem lets us filter the regression to get back to a “regular” Moran-form\n", + "regression with just $\mathbf{W}y$ and $y$. To do this, let\n", + "$\mathbf{P}=\mathbf{I}-\mathbf{X}(\mathbf{X}'\mathbf{X})^{-1}\mathbf{X}'$,\n", + "and use FWL to re-state the regression above into a regression about the\n", + "effect of $y$ on $\mathbf{W}y$ that is independent of $x$:\n", + "\n", + "$$ \mathbf{P}\mathbf{W}y = I_{y|x} \mathbf{P} y + \epsilon $$\n", + "\n", + "If you calculate this out, we get our estimator above. For the auxiliary\n", + "regression strategy, you can think of $e$ as “what remains of $y$ that\n", + "is independent of $x$.” So, the statistic $I_{x \rightarrow y}$ for our\n", + "auxiliary regression seeks to find the effect of “what remains of\n", + "$y$ that is independent of $x$” on the lag of “what remains of $y$ that\n", + "is independent of $x$.”\n", + "\n", + "$$ \mathbf{W}\mathbf{P}y = I_{x \rightarrow y}\mathbf{P}y + \epsilon$$\n", + "\n", + "The two are not equivalent because $\mathbf{WP}\neq\mathbf{PW}$ in\n", + "general, even for symmetric $\mathbf{W}$ and $\mathbf{P}$; in practice,\n", + "$\mathbf{P}$ is always symmetric, but $\mathbf{W}$ rarely is. Generally,\n", + "we want $\mathbf{P}\mathbf{W}y$ as our outcome, not\n", + "$\mathbf{W}\mathbf{P}y$." + ], + "id": "bf0b8f37-81ce-496f-a47a-1296628e1f17" + } + ], + "nbformat": 4, + "nbformat_minor": 5, + "metadata": { + "kernelspec": { + "name": "analysis", + "display_name": "Analysis", + "language": "python" + }, + "language_info": { + "name": "python", + "codemirror_mode": { + "name": "ipython", + "version": "3" + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + } +} \ No newline at end of file