From 676ec3f9de8d6d3d747898000ed67598e923746e Mon Sep 17 00:00:00 2001
From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com>
Date: Mon, 7 Oct 2024 12:49:08 +0200
Subject: [PATCH] 88 compute expected counts gives incorrect values for hanswt
 ext (#146)

* exposed testcase with duplicated name

* compute ext counts based on hw instead of all extremes, both in `calc_HWLWtidalindicators` and `calc_havengetallen`

* floored expected counts

* updated docstrings

* updated whatsnew.md
---
 docs/whats-new.md                     |  1 +
 kenmerkendewaarden/havengetallen.py   | 14 +++++---
 kenmerkendewaarden/tidalindicators.py | 52 ++++++++++++++++-----------
 tests/test_tidalindicators.py         | 15 ++++----
 4 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/docs/whats-new.md b/docs/whats-new.md
index c3a087c..907164c 100644
--- a/docs/whats-new.md
+++ b/docs/whats-new.md
@@ -8,6 +8,7 @@
 
 ### Fix
 - accomodate updated astrog datetime handling in hatyan 2.9.0 in [#141](https://github.com/Deltares-research/kenmerkendewaarden/pull/141)
+- more stable coverage computation in `calc_HWLWtidalindicators()` and `calc_havengetallen()` in [#146](https://github.com/Deltares-research/kenmerkendewaarden/pull/146)
 
 
 ## 0.2.0 (2024-09-02)
diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py
index 4467bfe..95e5106 100644
--- a/kenmerkendewaarden/havengetallen.py
+++ b/kenmerkendewaarden/havengetallen.py
@@ -50,7 +50,11 @@ def calc_havengetallen(
     return_df : bool
         Whether to return the enriched input dataframe. Default is False.
     min_coverage : float, optional
-        The minimal required coverage of the df_ext timeseries
+        The minimal required coverage (between 0 to 1) of the df_ext timeseries to
+        consider the statistics to be valid. It is the factor between the actual amount
+        and the expected amount of high waters in the series. Note that the expected
+        amount is not an exact extimate, so min_coverage=1 will probably result in nans
+        even though all extremes are present. The default is None.
     moonculm_offset : int, optional
         Offset between moonculmination and extremes. Passed on to `calc_HWLW_moonculm_combi`.
         The default is 4, which corresponds to a 2-day offset, which is applicable to the Dutch coast.
@@ -65,14 +69,16 @@ def calc_havengetallen(
 
     """
     raise_extremes_with_aggers(df_ext)
-    ser_ext = df_ext["values"]
 
     # check if coverage is high enough for havengetallen
     if min_coverage is not None:
         # TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year
         # TODO: if we supply 4 years of complete data instead of 10 years, no error is raised
-        df_actual_counts_peryear = compute_actual_counts(ser_ext, freq="Y")
-        df_expected_counts_peryear = compute_expected_counts(ser_ext, freq="Y")
+        data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"]
+        df_actual_counts_peryear = compute_actual_counts(data_pd_hw, freq="Y")
+        df_expected_counts_peryear = compute_expected_counts(data_pd_hw, freq="Y")
+        # floor expected counts to avoid rounding issues
+        df_expected_counts_peryear = df_expected_counts_peryear.apply(np.floor)
         df_min_counts_peryear = df_expected_counts_peryear * min_coverage
         bool_coverage_toolow = df_actual_counts_peryear < df_min_counts_peryear
         df_debug = pd.DataFrame(
diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py
index 1e728bf..46b6d8e 100644
--- a/kenmerkendewaarden/tidalindicators.py
+++ b/kenmerkendewaarden/tidalindicators.py
@@ -33,7 +33,11 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None):
     df_ext : pd.DataFrame
         Dataframe with extremes timeseries.
     min_coverage : float, optional
-        The minimum percentage (from 0 to 1) of timeseries coverage to consider the statistics to be valid. The default is None.
+        The minimal required coverage (between 0 to 1) of the df_ext timeseries to
+        consider the statistics to be valid. It is the factor between the actual amount
+        and the expected amount of high waters in the series. Note that the expected
+        amount is not an exact extimate, so min_coverage=1 will probably result in nans
+        even though all extremes are present. The default is None.
 
     Returns
     -------
@@ -48,27 +52,26 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None):
     raise_extremes_with_aggers(df_ext)
 
     # split to HW and LW separately, also groupby year
-    ser_ext = df_ext["values"]
-    data_pd_HW = df_ext.loc[df_ext["HWLWcode"] == 1]["values"]
-    data_pd_LW = df_ext.loc[df_ext["HWLWcode"] == 2]["values"]
+    data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"]
+    data_pd_lw = df_ext.loc[df_ext["HWLWcode"] == 2]["values"]
 
     # yearmean HWLW from HWLW values #maybe also add *_mean_permonth
-    pi_hw_y = pd.PeriodIndex(data_pd_HW.index, freq="Y")
-    pi_lw_y = pd.PeriodIndex(data_pd_LW.index, freq="Y")
-    HW_mean_peryear = data_pd_HW.groupby(pi_hw_y).mean()
-    LW_mean_peryear = data_pd_LW.groupby(pi_lw_y).mean()
+    pi_hw_y = pd.PeriodIndex(data_pd_hw.index, freq="Y")
+    pi_lw_y = pd.PeriodIndex(data_pd_lw.index, freq="Y")
+    HW_mean_peryear = data_pd_hw.groupby(pi_hw_y).mean()
+    LW_mean_peryear = data_pd_lw.groupby(pi_lw_y).mean()
 
     # derive GHHW/GHWS (gemiddeld hoogwater springtij) per month
-    pi_hw_m = pd.PeriodIndex(data_pd_HW.index, freq="M")
-    pi_lw_m = pd.PeriodIndex(data_pd_LW.index, freq="M")
+    pi_hw_m = pd.PeriodIndex(data_pd_hw.index, freq="M")
+    pi_lw_m = pd.PeriodIndex(data_pd_lw.index, freq="M")
     # proxy for HW at spring tide
-    HW_monthmax_permonth = data_pd_HW.groupby(pi_hw_m).max()
+    HW_monthmax_permonth = data_pd_hw.groupby(pi_hw_m).max()
     # proxy for LW at spring tide
-    LW_monthmin_permonth = data_pd_LW.groupby(pi_lw_m).min()
+    LW_monthmin_permonth = data_pd_lw.groupby(pi_lw_m).min()
     # proxy for HW at neap tide
-    HW_monthmin_permonth = data_pd_HW.groupby(pi_hw_m).min()
+    HW_monthmin_permonth = data_pd_hw.groupby(pi_hw_m).min()
     # proxy for LW at neap tide
-    LW_monthmax_permonth = data_pd_LW.groupby(pi_lw_m).max()
+    LW_monthmax_permonth = data_pd_lw.groupby(pi_lw_m).max()
 
     ser_list = [HW_mean_peryear, LW_mean_peryear,
                 HW_monthmax_permonth, LW_monthmin_permonth, 
@@ -81,12 +84,15 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None):
     if min_coverage is not None:
         assert 0 <= min_coverage <= 1
         # count timeseries values per year/month
-        ext_count_peryear = compute_actual_counts(ser_ext, freq="Y")
-        ext_count_permonth = compute_actual_counts(ser_ext, freq="M")
+        ext_count_peryear = compute_actual_counts(data_pd_hw, freq="Y")
+        ext_count_permonth = compute_actual_counts(data_pd_hw, freq="M")
 
         # compute expected counts and multiply with min_coverage to get minimal counts
-        min_count_peryear = compute_expected_counts(ser_ext, freq="Y") * min_coverage
-        min_count_permonth = compute_expected_counts(ser_ext, freq="M") * min_coverage
+        min_count_peryear = compute_expected_counts(data_pd_hw, freq="Y")
+        min_count_permonth = compute_expected_counts(data_pd_hw, freq="M")
+        # floor expected counts to avoid rounding issues
+        min_count_peryear = min_count_peryear.apply(np.floor) * min_coverage
+        min_count_permonth = min_count_permonth.apply(np.floor) * min_coverage
 
         # set all statistics that were based on too little values to nan
         HW_mean_peryear.loc[ext_count_peryear < min_count_peryear] = np.nan
@@ -115,8 +121,8 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None):
     LW_monthmin_mean_peryear = LW_monthmin_permonth.groupby(pi_lw_mmin_pm_y).mean()
     
     dict_tidalindicators = {
-        "HW_mean": data_pd_HW.mean(),  # GHW
-        "LW_mean": data_pd_LW.mean(),  # GLW
+        "HW_mean": data_pd_hw.mean(),  # GHW
+        "LW_mean": data_pd_lw.mean(),  # GLW
         "HW_mean_peryear": HW_mean_peryear,  # GHW peryear
         "LW_mean_peryear": LW_mean_peryear,  # GLW peryear
         "HW_monthmax_permonth": HW_monthmax_permonth,  # GHHW/GHWS permonth
@@ -205,6 +211,12 @@ def compute_expected_counts(ser_meas, freq):
     """
     Compute the expected number of values for all years/months in a timeseries index,
     by taking the number of days for each year/month and dividing it by the median frequency in that period.
+    This functionhas will result in incorrect values for months/years with only a value
+    on the first and last timestep, since the derived frequency will be 15/183 days days
+    and this will result in 2 expected counts.
+    Furthermore, if applied to extremes, it is advisable to compute the expected counts
+    on high waters only since these have a more stable period than alternating high/low
+    waters. Even then, the expected counts are an indication for extremes.
     """
     # TODO: beware of series with e.g. only first and last value of month/year, this will result in freq=30days and then expected count of 2, it will pass even if there is almost no data
     df_meas = pd.DataFrame(ser_meas)
diff --git a/tests/test_tidalindicators.py b/tests/test_tidalindicators.py
index 78fe723..90f1406 100644
--- a/tests/test_tidalindicators.py
+++ b/tests/test_tidalindicators.py
@@ -44,8 +44,10 @@ def test_calc_HWLWtidalrange(df_ext_12_2010):
 def test_calc_HWLWtidalindicators(df_ext_12_2010_2014):
     ext_stats_notimezone = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014.tz_localize(None))
     ext_stats = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014)
-    ext_stats_min = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=1)
-    
+    ext_stats_min100 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=1)
+    ext_stats_min099 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=0.99)
+    ext_stats_min098 = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=0.98)
+
     expected_keys = ['HW_mean', 'LW_mean', 
                      'HW_mean_peryear', 'LW_mean_peryear', 
                      'HW_monthmax_permonth', 'LW_monthmin_permonth', 
@@ -54,11 +56,12 @@ def test_calc_HWLWtidalindicators(df_ext_12_2010_2014):
     for key in expected_keys:
         assert key in ext_stats.keys()
         assert (ext_stats[key] == ext_stats_notimezone[key]).all()
-        
-        
+
     assert ext_stats_notimezone['HW_monthmax_permonth'].isnull().sum() == 0
     assert ext_stats['HW_monthmax_permonth'].isnull().sum() == 0
-    assert ext_stats_min['HW_monthmax_permonth'].isnull().sum() == 13
+    assert ext_stats_min100['HW_monthmax_permonth'].isnull().sum() == 1
+    assert ext_stats_min099['HW_monthmax_permonth'].isnull().sum() == 1
+    assert ext_stats_min098['HW_monthmax_permonth'].isnull().sum() == 0
 
 
 @pytest.mark.unittest
@@ -185,7 +188,7 @@ def test_compute_expected_counts_twotimesteps(df_meas_2010_2014):
 
 
 @pytest.mark.unittest
-def test_calc_wltidalindicators(df_ext_12_2010_2014):
+def test_calc_wltidalindicators_ext(df_ext_12_2010_2014):
     ext_stats_notimezone = kw.calc_HWLWtidalindicators(
         df_ext_12_2010_2014.tz_localize(None)
     )