Skip to content

Add functions to compute, plot, store the local hazard exceedence intensity and RP maps (new branch) #898

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions climada/hazard/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from typing import Optional,List
import warnings

import geopandas as gpd
import numpy as np
from pathos.pools import ProcessPool as Pool
from scipy import sparse
Expand All @@ -39,6 +40,7 @@
import climada.util.constants as u_const
import climada.util.coordinates as u_coord
import climada.util.dates_times as u_dt
from climada.util.plot import GdfMeta


LOGGER = logging.getLogger(__name__)
Expand Down Expand Up @@ -450,6 +452,60 @@ def sanitize_event_ids(self):
LOGGER.debug('Resetting event_id.')
self.event_id = np.arange(1, self.event_id.size + 1)

def local_return_period(self, threshold_intensities):
"""Compute local return periods for given hazard intensities.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good to have a clear explanation here of what this method is doing. How are the local return periods computed? Interpolation? Fitting?


Parameters
----------
threshold_intensities : np.array
Hazard intensities to consider.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I find the name threshold_intensities rather confusing, and the docstring does not really help. Can you be a bit more verbatim please?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what about "test_intensities"? I agree, I'll update the docstrings

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmm I think the word test would be quite confusing actually. Maybe just intensities? Or even avoid the plural and use intensity? Or keep threshold_intensities but with a clearer docstring.


Returns
-------
gdf : gpd.GeoDataFrame
GeoDataFrame containing return periods for given threshold intensity
description of units of values and threshold intensities in gdf.columns.name
"""
#check frequency unit
if self.frequency_unit not in ['1/year', 'annual', '1/y', '1/a']:
LOGGER.warning("The Hazard's frequency unit is %s and not %s which "
"most likely leads to incorrect results",
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

most likely leads to incorrect results does not tell me much. What is meant by that? Why would having different units of frequencies lead to incorrect results? What one gets is simply return periods in the unit of the frequency.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, I could just use the inverse of the frequency attribute. I now added using a if elif statement that can read years, months, and weeks. I don't know how to make this more generic and easier because the frequency attribute is just a string and not a "physical" frequency unit that can be converted eg to years.

self.frequency_unit, u_const.DEF_FREQ_UNIT)

# Ensure threshold_intensities is a numpy array
threshold_intensities = np.array(threshold_intensities)

num_cen = self.intensity.shape[1]
return_periods = np.zeros((len(threshold_intensities), num_cen))

# Process each centroid in chunks as in local_exceedance_inten
block_size = CONFIG.max_matrix_size.int() // self.intensity.shape[0]
if not block_size:
raise ValueError('Increase max_matrix_size configuration parameter to >'
f'{self.intensity.shape[0]}')

for start_col in range(0, num_cen, block_size):
end_col = min(start_col + block_size, num_cen)
return_periods[:, start_col:end_col] = self._loc_return_period(
threshold_intensities,
self.intensity[:, start_col:end_col].toarray()
)

# create the output GeoDataFrame
gdf = gpd.GeoDataFrame(geometry = self.centroids.gdf['geometry'], crs = self.centroids.gdf.crs)
col_names = [f'{tresh_inten}' for tresh_inten in threshold_intensities]
gdf[col_names] = return_periods.T

#create gdf meta data
gdf_meta = GdfMeta(
name = 'Return Period',
unit = 'Years',
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use the units of the frequency (or rather its inverse)?

col_name = 'Threshold Intensity',
col_unit = self.units
)

return gdf, gdf_meta

def get_event_id(self, event_name):
"""Get an event id from its name. Several events might have the same
name.
Expand Down Expand Up @@ -613,6 +669,48 @@ def _loc_return_inten(self, return_periods, inten, exc_inten):
exc_inten[:, cen_idx] = self._cen_return_inten(
inten_sort[:, cen_idx], freq_sort[:, cen_idx],
self.intensity_thres, return_periods)

def _loc_return_period(self, threshold_intensities, inten):
"""Compute local return periods for given hazard intensities for a specific chunk of data.

Parameters
----------
threshold_intensities: np.array
Given hazard intensities for which to calculate return periods.
inten: np.array
The intensity array for a specific chunk of data.

Returns
-------
np.array
"""
# Assuming inten is sorted and calculating cumulative frequency
sort_pos = np.argsort(inten, axis=0)[::-1, :]
inten_sort = inten[sort_pos, np.arange(inten.shape[1])]
freq_sort = self.frequency[sort_pos]
freq_sort = np.cumsum(freq_sort, axis=0)
return_periods = np.zeros((len(threshold_intensities), inten.shape[1]))

for cen_idx in range(inten.shape[1]):
sorted_inten_cen = inten_sort[:, cen_idx]
cum_freq_cen = freq_sort[:, cen_idx]

for i, intensity in enumerate(threshold_intensities):
# Find the first occurrence where the intensity is less than the sorted intensities
exceedance_index = np.searchsorted(sorted_inten_cen[::-1], intensity, side='left')

# Calculate exceedance probability
if exceedance_index < len(cum_freq_cen):
exceedance_probability = cum_freq_cen[-exceedance_index - 1]
else:
exceedance_probability = 0 # Or set a default minimal probability

# Calculate and store return period
if exceedance_probability > 0:
return_periods[i, cen_idx] = 1 / exceedance_probability
else:
return_periods[i, cen_idx] = np.nan
return return_periods

def _check_events(self):
"""Check that all attributes but centroids contain consistent data.
Expand Down
1 change: 1 addition & 0 deletions climada/hazard/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from typing import Union, Optional, Callable, Dict, Any

import h5py
import netCDF4 as nc
import numpy as np
import pandas as pd
import rasterio
Expand Down
33 changes: 33 additions & 0 deletions climada/hazard/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@
Define Hazard Plotting Methods.
"""

import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

import climada.util.coordinates as u_coord
import climada.util.plot as u_plot


Expand Down Expand Up @@ -65,6 +67,37 @@ def plot_rp_intensity(self, return_periods=(25, 50, 100, 250),
colbar_name, title, smooth=smooth, axes=axis,
figsize=figsize, adapt_fontsize=adapt_fontsize, **kwargs)
return axis, inten_stats

def plot_local_rp(self, threshold_intensities, smooth=True, axis=None, figsize=(9, 13), adapt_fontsize=True, cmap = 'viridis_r', **kwargs):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Formatting.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, if a user wants to adapt the say color map, they must recompute the return periods every time? This seems rather inefficient. Would it not make sense to be able to pass the return periods to this function? Something like

plot_local_rp(self, threshold_intensities, return_periods=None, smooth=True, axis=None, figsize=(9, 13), adapt_fontsize=True, cmap = 'viridis_r', **kwargs):

Then, if return_periods is none, compute them. Else, use the return periods to make the graph. Maybe throw a warning if the threshold_itnensities are not in the gdf.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sorry, I forgot to remove this function. The functionality is now in subplots_from_gdf

"""Plot hazard local return periods for given hazard intensities.

Parameters
----------
threshold_intensities: np.array
Hazard intensities to consider for calculating return periods.
smooth: bool, optional
Smooth plot to plot.RESOLUTION x plot.RESOLUTION.
axis: matplotlib.axes._subplots.AxesSubplot, optional
Axis to use.
figsize: tuple, optional
Figure size for plt.subplots.
kwargs: optional
Arguments for pcolormesh matplotlib function used in event plots.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some parameters are missing.


Returns
-------
axis: matplotlib.axes._subplots.AxesSubplot
Matplotlib axis with the plot.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Incomplete.

"""
return_periods = self.local_return_period(threshold_intensities)
colbar_name = 'Return Period (years)'
title = list()
for haz_int in threshold_intensities:
title.append('Intensity: ' + f'{haz_int} {self.units}')
axis = u_plot.geo_im_from_array(return_periods, self.centroids.coord,
colbar_name, title, smooth=smooth, axes=axis,
figsize=figsize, adapt_fontsize=adapt_fontsize, cmap=cmap, **kwargs)
return axis, return_periods

def plot_intensity(self, event=None, centr=None, smooth=True, axis=None, adapt_fontsize=True,
**kwargs):
Expand Down
21 changes: 21 additions & 0 deletions climada/hazard/test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,6 +1022,27 @@ def test_ref_all_pass(self):
self.assertAlmostEqual(inten_stats[1][66], 70.608592953031405)
self.assertAlmostEqual(inten_stats[3][33], 88.510983305123631)
self.assertAlmostEqual(inten_stats[2][99], 79.717518054203623)

def test_local_return_period(self):
"""Compare local return periods against reference."""
haz = dummy_hazard()
haz.intensity = sparse.csr_matrix([
[1., 5., 1.],
[2., 2., 0.],
[3., 2., 1.],
[3., 2., 1.]
])
haz.frequency = np.full(4, 1.)
threshold_intensities = np.array([1., 2., 4.])
return_stats, _ = haz.local_return_period(threshold_intensities)
np.testing.assert_allclose(
return_stats[return_stats.columns[1:]].values.T,
np.array([
[0.25, 0.25, 0.33333333],
[0.33333333, 0.25, np.nan],
[np.nan, 1., np.nan]
])
)


class TestYearset(unittest.TestCase):
Expand Down
4 changes: 4 additions & 0 deletions climada/hazard/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,10 @@
import unittest
from unittest.mock import patch
import datetime as dt
import os
from pathlib import Path
from tempfile import TemporaryDirectory
import rasterio

from pyproj import CRS
import numpy as np
Expand All @@ -32,7 +34,9 @@
from climada.hazard.base import Hazard
from climada.util.constants import DEF_FREQ_UNIT, HAZ_TEMPLATE_XLS, HAZ_DEMO_FL, DEF_CRS
from climada.hazard.test.test_base import DATA_DIR, dummy_hazard
from climada.test import get_test_file

HAZ_TEST_TC :Path = get_test_file('test_tc_florida')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not part of this PR.


class TestReadDefaultNetCDF(unittest.TestCase):
"""Test reading a NetCDF file where the coordinates to read match the dimensions"""
Expand Down
8 changes: 8 additions & 0 deletions climada/test/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,14 @@ def test_hazard_rp_intensity(self):
(axis1, axis2), _ = hazard.plot_rp_intensity([25, 50])
self.assertEqual('Return period: 25 years', axis1.get_title())
self.assertEqual('Return period: 50 years', axis2.get_title())

# def test_plot_local_rp(self):
# """"Plot local return period maps for different hazard intensities"""
# hazard = Hazard.from_hdf5(HAZ_TEST_TC)
# (axis1, axis2), return_stats = hazard.plot_local_rp([10., 20.])
# self.assertEqual('Intensity: 10.0 m/s', axis1.get_title())
# self.assertEqual('Intensity: 20.0 m/s', axis2.get_title())
# np.testing.assert_array_equal(return_stats.shape, (2, 100))

def test_exposures_value_pass(self):
"""Plot exposures values."""
Expand Down
72 changes: 72 additions & 0 deletions climada/util/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
from cartopy.mpl.gridliner import LONGITUDE_FORMATTER, LATITUDE_FORMATTER
from rasterio.crs import CRS
import requests
from geopandas import GeoDataFrame
from collections import namedtuple

from climada.util.constants import CMAP_EXPOSURES, CMAP_CAT, CMAP_RASTER
from climada.util.files_handler import to_list
Expand Down Expand Up @@ -874,3 +876,73 @@ def multibar_plot(ax, data, colors=None, total_width=0.8, single_width=1,
# Draw legend if we need
if legend:
ax.legend(bars, data.keys())

# Create GdfMeta class (as a named tuple) for GeoDataFrame meta data
GdfMeta = namedtuple('GdfMeta', ['name', 'unit', 'col_name', 'col_unit'])
Copy link
Member

@chahank chahank Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmmm I am not sure whether this is a good idea. Maybe it is, maybe not.

@emanuel-schmid What do you think? Should we add some nametuple in the utility plot for a specific case of geopandas plotting?

Copy link
Member

@chahank chahank Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To add more details: this is called very generically GdfMeta, but refers to only one very specific data set of geodataframes.

col_name is already in the dataframe, col_unit and unit are rather confusing, and name is a bit unclear to me too.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggested this to pass information to the plot that is not explicitly contained in the GeoDataFrame. GdfMeta can be renamed, of course. But I think a namedtuple makes more sense than a dict, because you can be sure which keys/attributes are defined. col_name specifies the name of the column to plot. But I agree that the names can be adjusted and made clearer.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea of passing some meta data for the plotting. I am not sure that this is the perfect way this way :D At first I thought that GdfMeta some new property of GeoDataFrames. Also, is the current proposal general enough for a generic geopandas plotting method in CLIMADA? Or should it then be made more specific.

And, namedtuple makes sense indeed.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ValentinGebhart, could we maybe find another way to put this information into the GeoDataFrame? @chahank and I had the following ideas when discussing this in person:

  • The col_name to plot can be a separate parameter, as users generally have to state which functions to plot in GeoDataFrame.plot() and similar functions
  • The col_unit can be added to the column name in parentheses or brackets. The plot function then reads this information is brackets/parentheses are found, otherwise it does not display a unit.

Copy link
Collaborator Author

@ValentinGebhart ValentinGebhart Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that the naming is not optimal. The included information should be: what quantity (+ units) is represented by the values in the gdf, what quantity (+ units) is represented by the values in the column labels. The rows always correspond to centroids. Maybe instead of name we could use quantity or value_type?

Otherwise, the plotting function should be "generic", in the sense that it produces one subplot per column of a generic gdf, using the values to color the different points, and using the gdf_meta varibale for figure and axis label. Of course, it is still using the geo_im_from_array util function, and I'm sure there are other ways to plot a gdf with several columns.
Let me know what you think

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@peanutfun sorry I didn't see your suggestion in time. We can definitely add the labels and units as parameter(s) to the plot function. Then the user has to know about what their gdf represents without us helping by providing the information, which I guess is ok but is a bit more work to the user. If we put the as defaults to the plot function, the plot function is not generic anymore.

Copy link
Collaborator

@emanuel-schmid emanuel-schmid Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Basically along the same lines as @peanutfun
I'd suggest to have two parameters instead of gdf_meta: GdfMeta,
colbar_name: str and title_subplots: lambda.

To get the same result as implemented now when calling

subplots_from_gdf(
    gdf=gdf, 
    gdf_meta=GdfMeta(name='Return Period', unit='Years', col_name='Threshold Intensity', col_unit='m/s')
)

you would

subplots_from_gdf(
    gdf=gdf, 
    colbar_name="Return Period (Years)", 
    title_subplots=lambda column: f"Threshold Intensity: {column} m/s",
)

Seems better readable and more flexible

Copy link
Collaborator

@emanuel-schmid emanuel-schmid Jul 11, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

consequently, I'd let the local_return_period return a triple

Returns
-------
gdf : GeoDataFrame
label : str
    GeoDataFrame label, for reporting and plotting
column_label : lambda
    Column label, for reporting and plotting

in case the user is not 100% sure what they did and is happy to get this kind of conformation. (Why not?)


def subplots_from_gdf(gdf: GeoDataFrame, gdf_meta: GdfMeta = None, smooth=True, axis=None, figsize=(9, 13), adapt_fontsize=True, **kwargs):
"""Plot hazard local return periods for given hazard intensities.

Parameters
----------
gdf: gpd.GeoDataFrame
return periods per threshold intensity
gdf_meta:
climada.util.plot.GdfMeta
gdf meta data in named tuple with attributes 'name' (quantity in gdf), 'unit', (unit thereof)
'col_name' (quantity in column labels), 'col_unit' (unit thereof)
smooth: bool, optional
Smooth plot to plot.RESOLUTION x plot.RESOLUTION. Default is True
axis: matplotlib.axes._subplots.AxesSubplot, optional
Axis to use. Default is None
figsize: tuple, optional
Figure size for plt.subplots. Default is (9, 13)
adapt_fontsize: bool, optional
If set to true, the size of the fonts will be adapted to the size of the figure.
Otherwise the default matplotlib font size is used. Default is True.
kwargs: optional
Arguments for pcolormesh matplotlib function used in event plots.

Returns
-------
axis: matplotlib.axes._subplots.AxesSubplot
Matplotlib axis with the plot.
"""
# check if inputs are correct types
if not isinstance(gdf, GeoDataFrame):
raise ValueError("gdf is not a GeoDataFrame")
gdf = gdf[['geometry', *[col for col in gdf.columns if col != 'geometry']]]

# read meta data for fig and axis labels
if not isinstance(gdf_meta, GdfMeta):
#warnings.warn("gdf_meta variable is not of type climada.util.plot.GdfMeta. Figure and axis labels will be missing.")
print("gdf_meta variable is not of type climada.util.plot.GdfMeta. Figure and axis labels will be missing.")
colbar_name, title_subplots = None, [f"{col}" for col in gdf.columns[1:]]
else:
colbar_name = f"{gdf_meta.name} ({gdf_meta.unit})"
title_subplots = [f"{gdf_meta.col_name}: {thres_inten} {gdf_meta.col_unit}"
for thres_inten in gdf.columns[1:]]

# change default plot kwargs if plotting return periods
if gdf_meta.name == 'Return Period':
if 'cmap' not in kwargs.keys():
kwargs.update({'cmap': 'viridis_r'})
if 'norm' not in kwargs.keys():
kwargs.update(
{'norm': mpl.colors.LogNorm(vmin=gdf.values[:,1:].min(), vmax=gdf.values[:,1:].max()),
'vmin': None, 'vmax': None}
)

axis = geo_im_from_array(
gdf.values[:,1:].T,
gdf.geometry.get_coordinates().values[:,::-1],
colbar_name,
title_subplots,
smooth=smooth,
axes=axis,
figsize=figsize,
adapt_fontsize=adapt_fontsize,
**kwargs
)

return axis
18 changes: 18 additions & 0 deletions climada/util/test/test_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@
import matplotlib.pyplot as plt
from matplotlib import colormaps as cm
import cartopy.crs as ccrs
from geopandas import GeoDataFrame
from shapely import Point

import climada.util.plot as u_plot

Expand Down Expand Up @@ -150,6 +152,22 @@ def test_geo_im_from_array(self):
self.assertEqual(cmap, ax.collections[0].cmap.name)
plt.close()

def test_subplots_from_gdf(self):
return_periods = GeoDataFrame(
data = ((2., 5.), (3., 6.), (None, 2.), (1., 7.)),
columns = ('10.0', '20.0')
)
return_periods['geometry'] = (Point(45., 26.), Point(46., 26.), Point(45., 27.), Point(46., 27.))
gdf_meta = u_plot.GdfMeta(
name = 'Return Period',
unit = 'Years',
col_name = 'Threshold Intensity',
col_unit = 'm/s'
)
(axis1, axis2) = u_plot.subplots_from_gdf(return_periods, gdf_meta)
self.assertEqual('Threshold Intensity: 10.0 m/s', axis1.get_title())
self.assertEqual('Threshold Intensity: 20.0 m/s', axis2.get_title())
plt.close()

# Execute Tests
if __name__ == "__main__":
Expand Down
287 changes: 196 additions & 91 deletions doc/tutorial/climada_hazard_Hazard.ipynb

Large diffs are not rendered by default.

Loading