From 676ec3f9de8d6d3d747898000ed67598e923746e Mon Sep 17 00:00:00 2001 From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:49:08 +0200 Subject: [PATCH] 88 compute expected counts gives incorrect values for hanswt ext (#146) * exposed testcase with duplicated name * compute ext counts based on hw instead of all extremes, both in `calc_HWLWtidalindicators` and `calc_havengetallen` * floored expected counts * updated docstrings * updated whatsnew.md --- docs/whats-new.md | 1 + kenmerkendewaarden/havengetallen.py | 14 +++++--- kenmerkendewaarden/tidalindicators.py | 52 ++++++++++++++++----------- tests/test_tidalindicators.py | 15 ++++---- 4 files changed, 52 insertions(+), 30 deletions(-) diff --git a/docs/whats-new.md b/docs/whats-new.md index c3a087c..907164c 100644 --- a/docs/whats-new.md +++ b/docs/whats-new.md @@ -8,6 +8,7 @@ ### Fix - accomodate updated astrog datetime handling in hatyan 2.9.0 in [#141](https://github.com/Deltares-research/kenmerkendewaarden/pull/141) +- more stable coverage computation in `calc_HWLWtidalindicators()` and `calc_havengetallen()` in [#146](https://github.com/Deltares-research/kenmerkendewaarden/pull/146) ## 0.2.0 (2024-09-02) diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py index 4467bfe..95e5106 100644 --- a/kenmerkendewaarden/havengetallen.py +++ b/kenmerkendewaarden/havengetallen.py @@ -50,7 +50,11 @@ def calc_havengetallen( return_df : bool Whether to return the enriched input dataframe. Default is False. min_coverage : float, optional - The minimal required coverage of the df_ext timeseries + The minimal required coverage (between 0 to 1) of the df_ext timeseries to + consider the statistics to be valid. It is the factor between the actual amount + and the expected amount of high waters in the series. Note that the expected + amount is not an exact extimate, so min_coverage=1 will probably result in nans + even though all extremes are present. The default is None. moonculm_offset : int, optional Offset between moonculmination and extremes. Passed on to `calc_HWLW_moonculm_combi`. The default is 4, which corresponds to a 2-day offset, which is applicable to the Dutch coast. @@ -65,14 +69,16 @@ def calc_havengetallen( """ raise_extremes_with_aggers(df_ext) - ser_ext = df_ext["values"] # check if coverage is high enough for havengetallen if min_coverage is not None: # TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year # TODO: if we supply 4 years of complete data instead of 10 years, no error is raised - df_actual_counts_peryear = compute_actual_counts(ser_ext, freq="Y") - df_expected_counts_peryear = compute_expected_counts(ser_ext, freq="Y") + data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"] + df_actual_counts_peryear = compute_actual_counts(data_pd_hw, freq="Y") + df_expected_counts_peryear = compute_expected_counts(data_pd_hw, freq="Y") + # floor expected counts to avoid rounding issues + df_expected_counts_peryear = df_expected_counts_peryear.apply(np.floor) df_min_counts_peryear = df_expected_counts_peryear * min_coverage bool_coverage_toolow = df_actual_counts_peryear < df_min_counts_peryear df_debug = pd.DataFrame( diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py index 1e728bf..46b6d8e 100644 --- a/kenmerkendewaarden/tidalindicators.py +++ b/kenmerkendewaarden/tidalindicators.py @@ -33,7 +33,11 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): df_ext : pd.DataFrame Dataframe with extremes timeseries. min_coverage : float, optional - The minimum percentage (from 0 to 1) of timeseries coverage to consider the statistics to be valid. The default is None. + The minimal required coverage (between 0 to 1) of the df_ext timeseries to + consider the statistics to be valid. It is the factor between the actual amount + and the expected amount of high waters in the series. Note that the expected + amount is not an exact extimate, so min_coverage=1 will probably result in nans + even though all extremes are present. The default is None. Returns ------- @@ -48,27 +52,26 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): raise_extremes_with_aggers(df_ext) # split to HW and LW separately, also groupby year - ser_ext = df_ext["values"] - data_pd_HW = df_ext.loc[df_ext["HWLWcode"] == 1]["values"] - data_pd_LW = df_ext.loc[df_ext["HWLWcode"] == 2]["values"] + data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"] + data_pd_lw = df_ext.loc[df_ext["HWLWcode"] == 2]["values"] # yearmean HWLW from HWLW values #maybe also add *_mean_permonth - pi_hw_y = pd.PeriodIndex(data_pd_HW.index, freq="Y") - pi_lw_y = pd.PeriodIndex(data_pd_LW.index, freq="Y") - HW_mean_peryear = data_pd_HW.groupby(pi_hw_y).mean() - LW_mean_peryear = data_pd_LW.groupby(pi_lw_y).mean() + pi_hw_y = pd.PeriodIndex(data_pd_hw.index, freq="Y") + pi_lw_y = pd.PeriodIndex(data_pd_lw.index, freq="Y") + HW_mean_peryear = data_pd_hw.groupby(pi_hw_y).mean() + LW_mean_peryear = data_pd_lw.groupby(pi_lw_y).mean() # derive GHHW/GHWS (gemiddeld hoogwater springtij) per month - pi_hw_m = pd.PeriodIndex(data_pd_HW.index, freq="M") - pi_lw_m = pd.PeriodIndex(data_pd_LW.index, freq="M") + pi_hw_m = pd.PeriodIndex(data_pd_hw.index, freq="M") + pi_lw_m = pd.PeriodIndex(data_pd_lw.index, freq="M") # proxy for HW at spring tide - HW_monthmax_permonth = data_pd_HW.groupby(pi_hw_m).max() + HW_monthmax_permonth = data_pd_hw.groupby(pi_hw_m).max() # proxy for LW at spring tide - LW_monthmin_permonth = data_pd_LW.groupby(pi_lw_m).min() + LW_monthmin_permonth = data_pd_lw.groupby(pi_lw_m).min() # proxy for HW at neap tide - HW_monthmin_permonth = data_pd_HW.groupby(pi_hw_m).min() + HW_monthmin_permonth = data_pd_hw.groupby(pi_hw_m).min() # proxy for LW at neap tide - LW_monthmax_permonth = data_pd_LW.groupby(pi_lw_m).max() + LW_monthmax_permonth = data_pd_lw.groupby(pi_lw_m).max() ser_list = [HW_mean_peryear, LW_mean_peryear, HW_monthmax_permonth, LW_monthmin_permonth, @@ -81,12 +84,15 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): if min_coverage is not None: assert 0 <= min_coverage <= 1 # count timeseries values per year/month - ext_count_peryear = compute_actual_counts(ser_ext, freq="Y") - ext_count_permonth = compute_actual_counts(ser_ext, freq="M") + ext_count_peryear = compute_actual_counts(data_pd_hw, freq="Y") + ext_count_permonth = compute_actual_counts(data_pd_hw, freq="M") # compute expected counts and multiply with min_coverage to get minimal counts - min_count_peryear = compute_expected_counts(ser_ext, freq="Y") * min_coverage - min_count_permonth = compute_expected_counts(ser_ext, freq="M") * min_coverage + min_count_peryear = compute_expected_counts(data_pd_hw, freq="Y") + min_count_permonth = compute_expected_counts(data_pd_hw, freq="M") + # floor expected counts to avoid rounding issues + min_count_peryear = min_count_peryear.apply(np.floor) * min_coverage + min_count_permonth = min_count_permonth.apply(np.floor) * min_coverage # set all statistics that were based on too little values to nan HW_mean_peryear.loc[ext_count_peryear < min_count_peryear] = np.nan @@ -115,8 +121,8 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): LW_monthmin_mean_peryear = LW_monthmin_permonth.groupby(pi_lw_mmin_pm_y).mean() dict_tidalindicators = { - "HW_mean": data_pd_HW.mean(), # GHW - "LW_mean": data_pd_LW.mean(), # GLW + "HW_mean": data_pd_hw.mean(), # GHW + "LW_mean": data_pd_lw.mean(), # GLW "HW_mean_peryear": HW_mean_peryear, # GHW peryear "LW_mean_peryear": LW_mean_peryear, # GLW peryear "HW_monthmax_permonth": HW_monthmax_permonth, # GHHW/GHWS permonth @@ -205,6 +211,12 @@ def compute_expected_counts(ser_meas, freq): """ Compute the expected number of values for all years/months in a timeseries index, by taking the number of days for each year/month and dividing it by the median frequency in that period. + This functionhas will result in incorrect values for months/years with only a value + on the first and last timestep, since the derived frequency will be 15/183 days days + and this will result in 2 expected counts. + Furthermore, if applied to extremes, it is advisable to compute the expected counts + on high waters only since these have a more stable period than alternating high/low + waters. Even then, the expected counts are an indication for extremes. """ # TODO: beware of series with e.g. only first and last value of month/year, this will result in freq=30days and then expected count of 2, it will pass even if there is almost no data df_meas = pd.DataFrame(ser_meas) diff --git a/tests/test_tidalindicators.py b/tests/test_tidalindicators.py index 78fe723..90f1406 100644 --- a/tests/test_tidalindicators.py +++ b/tests/test_tidalindicators.py @@ -44,8 +44,10 @@ def test_calc_HWLWtidalrange(df_ext_12_2010): def test_calc_HWLWtidalindicators(df_ext_12_2010_2014): ext_stats_notimezone = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014.tz_localize(None)) ext_stats = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014) - ext_stats_min = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=1) - + ext_stats_min100 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=1) + ext_stats_min099 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=0.99) + ext_stats_min098 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=0.98) + expected_keys = ['HW_mean', 'LW_mean', 'HW_mean_peryear', 'LW_mean_peryear', 'HW_monthmax_permonth', 'LW_monthmin_permonth', @@ -54,11 +56,12 @@ def test_calc_HWLWtidalindicators(df_ext_12_2010_2014): for key in expected_keys: assert key in ext_stats.keys() assert (ext_stats[key] == ext_stats_notimezone[key]).all() - - + assert ext_stats_notimezone['HW_monthmax_permonth'].isnull().sum() == 0 assert ext_stats['HW_monthmax_permonth'].isnull().sum() == 0 - assert ext_stats_min['HW_monthmax_permonth'].isnull().sum() == 13 + assert ext_stats_min100['HW_monthmax_permonth'].isnull().sum() == 1 + assert ext_stats_min099['HW_monthmax_permonth'].isnull().sum() == 1 + assert ext_stats_min098['HW_monthmax_permonth'].isnull().sum() == 0 @pytest.mark.unittest @@ -185,7 +188,7 @@ def test_compute_expected_counts_twotimesteps(df_meas_2010_2014): @pytest.mark.unittest -def test_calc_wltidalindicators(df_ext_12_2010_2014): +def test_calc_wltidalindicators_ext(df_ext_12_2010_2014): ext_stats_notimezone = kw.calc_HWLWtidalindicators( df_ext_12_2010_2014.tz_localize(None) )