From 65adefef07737aaf931be61ab7ddf2971a8cd4af Mon Sep 17 00:00:00 2001 From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:07:46 +0200 Subject: [PATCH] 97 improve docstrings for public functions (#98) * docstrings for data_retrieve.py * add docstrings to data_analysis.py * added docstrings for tidalindicators.py * added docstrings for havengetallen.py --- kenmerkendewaarden/data_analysis.py | 83 +++++++++++++++++- kenmerkendewaarden/data_retrieve.py | 90 +++++++++++++++++++- kenmerkendewaarden/havengetallen.py | 42 +++++++++- kenmerkendewaarden/overschrijding.py | 13 +-- kenmerkendewaarden/tidalindicators.py | 116 ++++++++++++++------------ tests/test_tidalindicators.py | 6 +- 6 files changed, 277 insertions(+), 73 deletions(-) diff --git a/kenmerkendewaarden/data_analysis.py b/kenmerkendewaarden/data_analysis.py index 87c17ac..8bf7042 100644 --- a/kenmerkendewaarden/data_analysis.py +++ b/kenmerkendewaarden/data_analysis.py @@ -22,7 +22,26 @@ logger = logging.getLogger(__name__) -def plot_measurements_amount(df, relative=False): +def plot_measurements_amount(df:pd.DataFrame, relative:bool = False): + """ + Read the measurements amount csv and generate a pcolormesh figure of all years and stations. + The colors indicate the absolute or relative number of measurements per year. + + Parameters + ---------- + df : pd.DataFrame + Dataframe with the amount of measurements for several years per station. + relative : bool, optional + Whether to scale the amount of measurements with the median of all measurement amounts for the same year. The default is False. + + Returns + ------- + fig : matplotlib.figure.Figure + Figure handle. + ax : matplotlib.axes._axes.Axes + Figure axis handle. + + """ df = df.copy() df[df==0] = np.nan @@ -45,7 +64,25 @@ def plot_measurements_amount(df, relative=False): return fig, ax -def plot_measurements(df, df_ext=None): +def plot_measurements(df:pd.DataFrame, df_ext:pd.DataFrame = None): + """ + Generate a timeseries figure for the measurement timeseries (and extremes) of this station. + + Parameters + ---------- + df : pd.DataFrame + Dataframe with the measurement timeseries for a particular station. + df_ext : pd.DataFrame, optional + Dataframe with the measurement extremes for a particular station. + + Returns + ------- + fig : matplotlib.figure.Figure + Figure handle. + ax : matplotlib.axes._axes.Axes + Figure axis handle. + + """ station_df = df.attrs["station"] if df_ext is not None: station_df_ext = df_ext.attrs["station"] @@ -83,7 +120,27 @@ def plot_measurements(df, df_ext=None): return fig, (ax1,ax2) -def plot_stations(station_list, crs=None, add_labels=False): +def plot_stations(station_list:list, crs:int = None, add_labels:bool = False): + """ + Plot the stations by subsetting a ddlpy catalog with the provided list of stations. + + Parameters + ---------- + station_list : list + List of stations to plot the locations from. + crs : int, optional + Coordinate reference system, for instance 28992. The coordinates retrieved from the DDL will be converted to this EPSG. The default is None. + add_labels : bool, optional + Whether to add station code labels in the figure, useful for debugging. The default is False. + + Returns + ------- + fig : matplotlib.figure.Figure + Figure handle. + ax : matplotlib.axes._axes.Axes + Figure axis handle. 
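+
+    Examples
+    --------
+    A minimal usage sketch; the station code below is only an example:
+
+    >>> from kenmerkendewaarden.data_analysis import plot_stations
+    >>> fig, ax = plot_stations(station_list=["HOEKVHLD"], crs=28992, add_labels=True)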
+
+    """
     locs_meas_ts_all, locs_meas_ext_all, _ = retrieve_catalog(crs=crs)
     locs_ts = locs_meas_ts_all.loc[locs_meas_ts_all.index.isin(station_list)]
     locs_ext = locs_meas_ext_all.loc[locs_meas_ext_all.index.isin(station_list)]
@@ -189,7 +246,25 @@ def get_stats_from_dataframe(df):
     return ds_stats
 
 
-def derive_statistics(dir_output, station_list, extremes):
+def derive_statistics(dir_output:str, station_list:list, extremes:bool):
+    """
+    Derive several statistics for the measurements of each station in the list.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurement netcdf files are stored.
+    station_list : list
+        List of station names to derive statistics for, for instance ["HOEKVHLD"].
+    extremes : bool
+        Whether to derive statistics from waterlevel timeseries or extremes.
+
+    Returns
+    -------
+    data_summary : pd.DataFrame
+        A dataframe with several statistics for each station from the provided list.
+
+    """
     row_list = []
     for current_station in station_list:
         logger.info(f'deriving statistics for {current_station} (extremes={extremes})')
diff --git a/kenmerkendewaarden/data_retrieve.py b/kenmerkendewaarden/data_retrieve.py
index ebb7353..6666832 100644
--- a/kenmerkendewaarden/data_retrieve.py
+++ b/kenmerkendewaarden/data_retrieve.py
@@ -86,7 +86,28 @@ def check_locations_amount(locations):
         raise ValueError(f"multiple stations present after station subsetting:\n{locations}")
 
 
-def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_date, end_date):
+def retrieve_measurements_amount(dir_output:str, station_list:list, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp):
+    """
+    Retrieve the amount of measurements or extremes for a list of stations from the DDL with ddlpy.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements amount csv file will be stored.
+    station_list : list
+        List of station names, for instance ["HOEKVHLD"].
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    start_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        start date of the measurements to be retrieved.
+    end_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        end date of the measurements to be retrieved.
+
+    Returns
+    -------
+    None
+
+    """
     locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()
 
     if extremes:
@@ -130,7 +151,23 @@ def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_
     df_amount.to_csv(file_csv_amount)
 
 
-def read_measurements_amount(dir_output, extremes:bool):
+def read_measurements_amount(dir_output:str, extremes:bool):
+    """
+    Read the measurements amount csv into a dataframe.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements are stored.
+    extremes : bool
+        Whether to read measurements amount for waterlevel timeseries or extremes.
+
+    Returns
+    -------
+    df_amount : pd.DataFrame
+        DataFrame with the amount of measurements per year.
+
+    """
     if extremes:
         fname = DICT_FNAMES['amount_ext']
     else:
@@ -146,7 +183,30 @@ def read_measurements_amount(dir_output, extremes:bool):
     return df_amount
 
 
-def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date, end_date, drop_if_constant=None):
+def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp, drop_if_constant:list = None):
+    """
+    Retrieve timeseries with measurements or extremes for a single station from the DDL with ddlpy.
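+
+    The resulting timeseries is stored as a netcdf file in dir_output. A minimal usage
+    sketch; the station code and dates below are only examples:
+
+    >>> from kenmerkendewaarden.data_retrieve import retrieve_measurements
+    >>> retrieve_measurements(dir_output=".", station="HOEKVHLD", extremes=False,
+    ...                       start_date="2020-01-01", end_date="2021-01-01")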
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurement netcdf file will be stored.
+    station : str
+        station name, for instance "HOEKVHLD".
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    start_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        start date of the measurements to be retrieved.
+    end_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        end date of the measurements to be retrieved.
+    drop_if_constant : list, optional
+        A list of columns to drop if the row values are constant, to save disk space. The default is None.
+
+    Returns
+    -------
+    None
+
+    """
 
     locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()
 
@@ -225,7 +285,29 @@ def xarray_to_hatyan(ds):
     return df
 
 
-def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray=False, nap_correction=False):
+def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray:bool = False, nap_correction:bool = False):
+    """
+    Read the measurements netcdf as a dataframe.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements are stored.
+    station : str
+        station name, for instance "HOEKVHLD".
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    return_xarray : bool, optional
+        Whether to return raw xarray.Dataset instead of a DataFrame. The default is False.
+    nap_correction : bool, optional
+        Whether to correct for NAP2005. The default is False.
+
+    Returns
+    -------
+    df_meas : pd.DataFrame
+        DataFrame with the measurements or extremes timeseries.
+
+    """
 
     if extremes:
         fname = DICT_FNAMES["meas_ext"].format(station=station)
diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py
index f3e492b..e593a95 100644
--- a/kenmerkendewaarden/havengetallen.py
+++ b/kenmerkendewaarden/havengetallen.py
@@ -49,8 +49,8 @@ def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False, min_coverage=No
     df_havengetallen : pd.DataFrame
         DataFrame with havengetallen for all hour-classes. 0 corresponds to spring, 6 corresponds to neap, mean is mean.
-    return_df_ext : pd.DataFrame
-        An enriched copy of the input DataFrame, mainly for plotting.
+    df_ext : pd.DataFrame
+        An enriched copy of the input DataFrame including a 'culm_hr' column.
 
     """
     raise_extremes_with_aggers(df_ext)
@@ -174,7 +174,25 @@ def calc_HWLW_culmhr_summary_tidalcoeff(df_ext):
     return HWLW_culmhr_summary
 
 
-def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
+def plot_HWLW_pertimeclass(df_ext:pd.DataFrame, df_havengetallen:pd.DataFrame):
+    """
+    Plot the extremes for each hour-class, including a median line.
+
+    Parameters
+    ----------
+    df_ext : pd.DataFrame
+        DataFrame with measurement extremes, as provided by `kw.calc_havengetallen()` with return_df_ext=True.
+    df_havengetallen : pd.DataFrame
+        DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    axs : matplotlib.axes._axes.Axes
+        Figure axes handles.
+
+    """
 
     assert 'culm_hr' in df_ext.columns
 
@@ -204,7 +222,23 @@ def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
     return fig, axs
 
 
-def plot_aardappelgrafiek(df_havengetallen):
+def plot_aardappelgrafiek(df_havengetallen:pd.DataFrame):
+    """
+    Plot the median values of each hour-class in an aardappelgrafiek.
+
+    Parameters
+    ----------
+    df_havengetallen : pd.DataFrame
+        DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
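+        Only the hour-classes 0 to 11 are plotted; the 'mean' values are dropped first.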
+ + Returns + ------- + fig : matplotlib.figure.Figure + Figure handle. + ax : matplotlib.axes._axes.Axes + Figure axis handle. + + """ # remove mean column HWLW_culmhr_summary = df_havengetallen.loc[:11].copy() diff --git a/kenmerkendewaarden/overschrijding.py b/kenmerkendewaarden/overschrijding.py index f9b5dbb..736de82 100644 --- a/kenmerkendewaarden/overschrijding.py +++ b/kenmerkendewaarden/overschrijding.py @@ -9,7 +9,6 @@ from matplotlib import ticker from scipy import optimize, signal from typing import Union, List -import datetime as dt import logging from kenmerkendewaarden.data_retrieve import clip_timeseries_physical_break from kenmerkendewaarden.utils import raise_extremes_with_aggers @@ -30,7 +29,7 @@ def get_threshold_rowidx(df): def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None, inverse:bool = False, clip_physical_break:bool = False, - rule_type:str = None, rule_value=None, + rule_type:str = None, rule_value:(pd.Timestamp, float) = None, interp_freqs:list = None): """ Compute exceedance/deceedance frequencies based on measured extreme waterlevels. @@ -47,8 +46,9 @@ def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None, Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False. rule_type : str, optional break/linear/None, passed on to apply_trendanalysis(). The default is None. - rule_value : TYPE, optional - Value corresponding to rule_type. The default is None. + rule_value : (pd.Timestamp, float), optional + Value corresponding to rule_type, pd.Timestamp (or anything understood by pd.Timestamp) + in case of rule_type='break', float in case of rule_type='linear'. The default is None. interp_freqs : list, optional The frequencies to interpolate to, providing this will result in a "Geinterpoleerd" key in the returned dictionary. The default is None. @@ -321,7 +321,7 @@ def get_total_years(df: pd.DataFrame) -> float: return (df.index[-1] - df.index[0]).total_seconds() / (3600 * 24 * 365) -def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[float, dt.datetime]): +def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[pd.Timestamp, float]): # There are 2 rule types: - break -> Values before break are removed # - linear -> Values are increased/lowered based on value in value/year. It is assumes # that there is no linear trend at the latest time (so it works its way back @@ -329,7 +329,8 @@ def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[floa if rule_type == 'break': return df[rule_value:].copy() elif rule_type == 'linear': - df, rule_value = df.copy(), float(rule_value) + rule_value = float(rule_value) + df = df.copy() dx = np.array([rule_value*x.total_seconds()/(365*24*3600) for x in (df.index[-1] - df.index)]) df['values'] = df['values'] + dx return df diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py index e1373f2..7aa6751 100644 --- a/kenmerkendewaarden/tidalindicators.py +++ b/kenmerkendewaarden/tidalindicators.py @@ -23,21 +23,21 @@ logger = logging.getLogger(__name__) -def calc_HWLWtidalindicators(df_ext, min_coverage:float = None): +def calc_HWLWtidalindicators(df_ext:pd.DataFrame, min_coverage:float = None): """ - computes several tidal extreme indicators from tidal extreme dataset + Computes several tidal extreme indicators from tidal extreme dataset. Parameters ---------- - data_pd_HWLW_all : TYPE - DESCRIPTION. + df_ext : pd.DataFrame + Dataframe with extremes timeseries. 
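+        For instance the extremes dataframe returned by read_measurements() with extremes=True.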
min_coverage : float, optional The minimum percentage (from 0 to 1) of timeseries coverage to consider the statistics to be valid. The default is None. Returns ------- - dict_tidalindicators : TYPE - DESCRIPTION. + dict_tidalindicators : dict + Dictionary with several tidal indicators like yearly/monthly means. """ # dropping the timezone makes the code below much faster and gives equal results: https://github.com/pandas-dev/pandas/issues/58956 @@ -93,57 +93,57 @@ def calc_HWLWtidalindicators(df_ext, min_coverage:float = None): HW_monthmin_mean_peryear = HW_monthmin_permonth.groupby(pd.PeriodIndex(HW_monthmin_permonth.index, freq="Y"))[['values']].mean() LW_monthmax_mean_peryear = LW_monthmax_permonth.groupby(pd.PeriodIndex(LW_monthmax_permonth.index, freq="Y"))[['values']].mean() - dict_HWLWtidalindicators = {'HW_mean':data_pd_HW['values'].mean(), #GHW - 'LW_mean':data_pd_LW['values'].mean(), #GLW - 'HW_mean_peryear':HW_mean_peryear['values'], #GHW peryear - 'LW_mean_peryear':LW_mean_peryear['values'], #GLW peryear - 'HW_monthmax_permonth':HW_monthmax_permonth['values'], #GHHW/GHWS permonth - 'LW_monthmin_permonth':LW_monthmin_permonth['values'], #GLLW/GLWS permonth - 'HW_monthmax_mean_peryear':HW_monthmax_mean_peryear['values'], #GHHW/GHWS peryear - 'LW_monthmin_mean_peryear':LW_monthmin_mean_peryear['values'], #GLLW/GLWS peryear - 'HW_monthmin_mean_peryear':HW_monthmin_mean_peryear['values'], #GLHW/GHWN peryear - 'LW_monthmax_mean_peryear':LW_monthmax_mean_peryear['values'], #GHLW/GLWN peryear - } - - return dict_HWLWtidalindicators - - -def calc_wltidalindicators(data_wl_pd, min_coverage:float = None): + dict_tidalindicators = {'HW_mean':data_pd_HW['values'].mean(), #GHW + 'LW_mean':data_pd_LW['values'].mean(), #GLW + 'HW_mean_peryear':HW_mean_peryear['values'], #GHW peryear + 'LW_mean_peryear':LW_mean_peryear['values'], #GLW peryear + 'HW_monthmax_permonth':HW_monthmax_permonth['values'], #GHHW/GHWS permonth + 'LW_monthmin_permonth':LW_monthmin_permonth['values'], #GLLW/GLWS permonth + 'HW_monthmax_mean_peryear':HW_monthmax_mean_peryear['values'], #GHHW/GHWS peryear + 'LW_monthmin_mean_peryear':LW_monthmin_mean_peryear['values'], #GLLW/GLWS peryear + 'HW_monthmin_mean_peryear':HW_monthmin_mean_peryear['values'], #GLHW/GHWN peryear + 'LW_monthmax_mean_peryear':LW_monthmax_mean_peryear['values'], #GHLW/GLWN peryear + } + + return dict_tidalindicators + + +def calc_wltidalindicators(df_meas:pd.DataFrame, min_coverage:float = None): """ - computes monthly and yearly means from waterlevel timeseries + Computes monthly and yearly means from waterlevel timeseries. Parameters ---------- - data_wl_pd : TYPE - DESCRIPTION. + df_meas : pd.DataFrame + Dataframe with waterlevel timeseries. min_coverage : float, optional The minimum percentage (from 0 to 1) of timeseries coverage to consider the statistics to be valid. The default is None. Returns ------- - dict_wltidalindicators : TYPE - DESCRIPTION. + dict_tidalindicators : dict + Dictionary with several tidal indicators like yearly/monthly means. 
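+
+    Examples
+    --------
+    A minimal usage sketch, assuming measurements were already retrieved into dir_meas
+    (the station code is only an example):
+
+    >>> from kenmerkendewaarden.data_retrieve import read_measurements
+    >>> from kenmerkendewaarden.tidalindicators import calc_wltidalindicators
+    >>> df_meas = read_measurements(dir_output=dir_meas, station="HOEKVHLD", extremes=False)
+    >>> dict_wl = calc_wltidalindicators(df_meas, min_coverage=0.95)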
""" # dropping the timezone makes the code below much faster and gives equal results: https://github.com/pandas-dev/pandas/issues/58956 - if data_wl_pd.index.tz is not None: - data_wl_pd = data_wl_pd.tz_localize(None) + if df_meas.index.tz is not None: + df_meas = df_meas.tz_localize(None) # yearmean wl from wl values - wl_mean_peryear = data_wl_pd.groupby(pd.PeriodIndex(data_wl_pd.index, freq="Y"))[['values']].mean() - wl_mean_permonth = data_wl_pd.groupby(pd.PeriodIndex(data_wl_pd.index, freq="M"))[['values']].mean() + wl_mean_peryear = df_meas.groupby(pd.PeriodIndex(df_meas.index, freq="Y"))[['values']].mean() + wl_mean_permonth = df_meas.groupby(pd.PeriodIndex(df_meas.index, freq="M"))[['values']].mean() # replace invalids with nan (in case of too less values per month or year) if min_coverage is not None: assert 0 <= min_coverage <= 1 # count timeseries values per year/month - wl_count_peryear = compute_actual_counts(data_wl_pd, freq="Y") - wl_count_permonth = compute_actual_counts(data_wl_pd, freq="M") + wl_count_peryear = compute_actual_counts(df_meas, freq="Y") + wl_count_permonth = compute_actual_counts(df_meas, freq="M") # compute expected counts and multiply with min_coverage to get minimal counts - min_count_peryear = compute_expected_counts(data_wl_pd, freq="Y") * min_coverage - min_count_permonth = compute_expected_counts(data_wl_pd, freq="M") * min_coverage + min_count_peryear = compute_expected_counts(df_meas, freq="Y") * min_coverage + min_count_permonth = compute_expected_counts(df_meas, freq="M") * min_coverage # set all statistics that were based on too little values to nan wl_mean_peryear.loc[wl_count_peryear tuple: @@ -288,7 +300,7 @@ def calc_hat_lat_fromcomponents(comp: pd.DataFrame) -> tuple: Parameters ---------- comp : pd.DataFrame - DESCRIPTION. + DataFrame with amplitudes and phases for a list of components. Returns ------- diff --git a/tests/test_tidalindicators.py b/tests/test_tidalindicators.py index 42e2ed9..ed4cfea 100644 --- a/tests/test_tidalindicators.py +++ b/tests/test_tidalindicators.py @@ -9,9 +9,9 @@ @pytest.mark.unittest def test_calc_HWLWtidalrange(df_ext_12_2010): - ts_ext_range = kw.calc_HWLWtidalrange(df_ext_12_2010) + df_ext_range = kw.calc_HWLWtidalrange(df_ext_12_2010) - ranges = ts_ext_range["tidalrange"].values + ranges = df_ext_range["tidalrange"].values vals_expected = np.array([1.89, 1.89, 1.87, 1.87, 1.97, 1.97, 2.05, 2.05, 2.05, 2.05]) assert len(ranges) == 1411 assert np.allclose(ranges[:10], vals_expected) @@ -193,7 +193,7 @@ def test_calc_hat_lat_frommeasurements_tooshortperiod(df_meas_2010_2014): @pytest.mark.unittest def test_calc_HWLWtidalrange_aggers_input(df_ext_2010): with pytest.raises(ValueError) as e: - kw.calc_HWLWtidalrange(ts_ext=df_ext_2010) + kw.calc_HWLWtidalrange(df_ext=df_ext_2010) assert "contains aggers" in str(e.value)