Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 75 additions & 36 deletions libpysal/graph/_spatial_lag.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ def _lag_spatial(graph, y, categorical=False, ties="raise"):
graph : Graph
libpysal.graph.Graph
y : array
numpy array with dimensionality conforming to w
numpy array with dimensionality conforming to w. Can be 2D if all
columns are numerical.
categorical : bool
True if y is categorical, False if y is continuous.
ties : {'raise', 'random', 'tryself'}, optional
Expand Down Expand Up @@ -83,49 +84,87 @@ def _lag_spatial(graph, y, categorical=False, ties="raise"):
if isinstance(y, list):
y = np.array(y)

if (
isinstance(y.dtype, pd.CategoricalDtype)
if y.ndim == 1 and (
categorical
or isinstance(y.dtype, pd.CategoricalDtype)
or pd.api.types.is_object_dtype(y.dtype)
or pd.api.types.is_bool_dtype(y.dtype)
or pd.api.types.is_string_dtype(y.dtype)
):
categorical = True
if categorical:
if isinstance(y, np.ndarray):
y = pd.Series(y, index=graph.unique_ids)

df = pd.DataFrame(data=graph.adjacency)
df["neighbor_label"] = y.loc[graph.adjacency.index.get_level_values(1)].values
df["own_label"] = y.loc[graph.adjacency.index.get_level_values(0)].values
df["neighbor_idx"] = df.index.get_level_values(1)
df["focal_idx"] = df.index.get_level_values(0)
gb = df.groupby(["focal", "neighbor_label"]).count().groupby(level="focal")
n_ties = gb.apply(_check_ties).sum()
if n_ties and ties == "raise":
raise ValueError(
f"There are {n_ties} ties that must be broken "
f"to define the categorical "
"spatial lag for these observations. To address this "
"issue, consider setting `ties='tryself'` "
"or `ties='random'` or consult the documentation "
"about ties and the categorical spatial lag."
)
# either there are ties and random|tryself specified or
# there are no ties
gb = df.groupby(by=["focal"])
if ties == "random" or ties == "raise":
return gb.apply(_get_categorical_lag).values
elif ties == "tryself" or ties == "raise":
return gb.apply(_get_categorical_lag, ties="tryself").values
else:
raise ValueError(
f"Received option ties='{ties}', but only options "
"'raise','random','tryself' are supported."
)
return _categorical(graph, y, ties=ties)

return sp @ y


def _categorical(graph, y, ties):
"""
Compute the categorical spatial lag for each observation in a graph.

Parameters
----------
graph : object
y : array-like (numpy.ndarray or pandas.Series)
Categorical labels for each observation.
ties : {'raise', 'random', 'tryself'}
How to handle ties when multiple neighbor categories are equally frequent:
- 'raise' : raise a ValueError if any tie exists.
- 'random': break ties uniformly at random.
- 'tryself': if the focal unit's own label is among the tied labels,
choose the focal label; otherwise break ties (deterministic
choice defined by helper routine).

Returns
-------
numpy.ndarray
An array of categorical spatial lag values aligned with graph.unique_ids.

Raises
------
ValueError
- If ties are present and ties == 'raise'.
- If ties is not one of 'raise', 'random', or 'tryself'.

Notes
-----
The implementation groups adjacency entries by focal unit and counts neighbor
labels to determine the modal category per focal. Tie detection and
resolution are delegated to the helper functions _check_ties and
_get_categorical_lag. Using 'random' produces nondeterministic outputs unless
a random seed is fixed externally.
"""
if isinstance(y, np.ndarray):
y = pd.Series(y, index=graph.unique_ids)

df = pd.DataFrame(data=graph.adjacency)
df["neighbor_label"] = y.loc[graph.adjacency.index.get_level_values(1)].values
df["own_label"] = y.loc[graph.adjacency.index.get_level_values(0)].values
df["neighbor_idx"] = df.index.get_level_values(1)
df["focal_idx"] = df.index.get_level_values(0)
gb = df.groupby(["focal", "neighbor_label"]).count().groupby(level="focal")
n_ties = gb.apply(_check_ties).sum()
if n_ties and ties == "raise":
raise ValueError(
f"There are {n_ties} ties that must be broken "
f"to define the categorical "
"spatial lag for these observations. To address this "
"issue, consider setting `ties='tryself'` "
"or `ties='random'` or consult the documentation "
"about ties and the categorical spatial lag."
)
# either there are ties and random|tryself specified or
# there are no ties
gb = df.groupby(by=["focal"])
if ties == "random" or ties == "raise":
return gb.apply(_get_categorical_lag).values
elif ties == "tryself" or ties == "raise":
return gb.apply(_get_categorical_lag, ties="tryself").values
else:
raise ValueError(
f"Received option ties='{ties}', but only options "
"'raise','random','tryself' are supported."
)


def _check_ties(focal):
"""Reduction to determine if a focal unit has multiple modes for neighbor labels.

Expand Down
10 changes: 6 additions & 4 deletions libpysal/graph/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2035,18 +2035,20 @@ def higher_order(self, k=2, shortest_path=True, diagonal=False, lower_order=Fals

return higher

def lag(self, y, categorical=False, ties="raise"):
def lag(self, y, categorical=None, ties="raise"):
"""Spatial lag operator

Constructs spatial lag based on neighbor relations of the graph.


Parameters
----------
y : array
numpy array with dimensionality conforming to w
y : array_like
Array-like aligned with the graph. Can be 2-dimensional if
all columns are numerical.
categorical : bool
True if y is categorical, False if y is continuous.
True if y is categorical, False if y is continuous. If None, it is
derived from the dtype of ``y``.
ties : {'raise', 'random', 'tryself'}, optional
Policy on how to break ties when a focal unit has multiple
modes for a categorical lag.
Expand Down
24 changes: 24 additions & 0 deletions libpysal/graph/tests/test_spatial_lag.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,3 +71,27 @@ def test_categorical_custom_index(self):
np.testing.assert_array_equal(
expected, self.g.lag(["foo", "bar", "foo", "foo"])
)

def test_2d_array(self):
    """Lag of a 2D numeric array matches the expected values column-wise.

    Also checks that lagging each column on its own as a 1D array gives the
    same result as the corresponding column of the 2D lag.
    """
    ys = np.arange(27).reshape(9, 3)
    lag = self.gc.lag(ys)

    expected = np.array(
        [
            [6.0, 7.0, 8.0],
            [6.0, 7.0, 8.0],
            [9.0, 10.0, 11.0],
            [10.0, 11.0, 12.0],
            [12.0, 13.0, 14.0],
            [14.0, 15.0, 16.0],
            [15.0, 16.0, 17.0],
            [18.0, 19.0, 20.0],
            [18.0, 19.0, 20.0],
        ]
    )

    np.testing.assert_array_almost_equal(lag, expected)

    # test equality to 1d for every column (the original range(2) left the
    # last of the three columns unchecked)
    for i in range(ys.shape[1]):
        np.testing.assert_array_equal(self.gc.lag(ys[:, i]), lag[:, i])