Skip to content

Commit

Permalink
add fix for unequal lengths (#87)
Browse files Browse the repository at this point in the history
* add fix for unequal lengths

* added testcase

* updated whatsnew
  • Loading branch information
veenstrajelmer authored Jun 20, 2024
1 parent 53f9049 commit 8bb04fa
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

### Fix
- implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69)
- fixed different lengths of `compute_expected_counts()` and `compute_actual_counts()` in case of all-nan periods in [#87](https://github.com/Deltares-research/kenmerkendewaarden/pull/87)


## 0.1.0 (2024-03-11)
Expand Down
8 changes: 4 additions & 4 deletions kenmerkendewaarden/tidalindicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ def compute_actual_counts(df_meas, freq, column="values"):
"""
Compute the number of non-nan values in a column for all years/months in a timeseries index.
"""
df_meas_nonan = df_meas.loc[~df_meas[column].isnull()]
period_index = pd.PeriodIndex(df_meas_nonan.index, freq=freq)
df_actual_counts = df_meas_nonan.groupby(period_index)[column].count()
df_meas_isnotnull = ~df_meas[column].isnull()
period_index = pd.PeriodIndex(df_meas_isnotnull.index, freq=freq)
df_actual_counts = df_meas_isnotnull.groupby(period_index).sum()
return df_actual_counts


Expand All @@ -176,7 +176,7 @@ def compute_expected_counts(df_meas, freq):
Compute the expected number of values for all years/months in a timeseries index,
by taking the number of days for each year/month and dividing it by the median frequency in that period.
"""
# TODO: beware of series with e.g. only first and last value of month, this will result in freq=30days and then expected count of 1, it will pass even if there is almost no data
# TODO: beware of series with e.g. only first and last value of month/year, this will result in freq=30days and then expected count of 2, it will pass even if there is almost no data
df_meas = df_meas.copy()
df_meas["timediff"] = pd.TimedeltaIndex([pd.NaT]).append(df_meas.index[1:] - df_meas.index[:-1]) # TODO: from pandas>=2.1.4 the following also works: df_times.diff() (which results in a timedeltaindex of the correct length)
period_index = pd.PeriodIndex(df_meas.index, freq=freq)
Expand Down
50 changes: 50 additions & 0 deletions tests/test_tidalindicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import kenmerkendewaarden as kw
import numpy as np
from kenmerkendewaarden.tidalindicators import compute_actual_counts, compute_expected_counts
import pandas as pd


@pytest.mark.unittest
Expand Down Expand Up @@ -59,6 +61,54 @@ def test_calc_wltidalindicators_mincount(df_meas_2010_2014):
assert slotgemiddelden_dict_withgap_lower_threshold["wl_mean_peryear"].isnull().sum() == 0


@pytest.mark.unittest
def test_compute_expected_actual_counts_samelenght(df_meas_2010_2014):
    """
    Regression test for the length mismatch caused by nan-dropping:
    compute_actual_counts() and compute_expected_counts() must return
    series of equal length even when an entire period contains only nans.
    https://github.com/Deltares-research/kenmerkendewaarden/issues/83
    """
    # work on a copy so the shared fixture is not mutated, then blank out 2012
    df_gapped = df_meas_2010_2014.copy()
    df_gapped.loc["2012", "values"] = np.nan
    df_gapped.loc["2012", "qualitycode"] = 99

    # actual and expected counts must align for both yearly and monthly grouping
    for freq in ["Y", "M"]:
        actual_counts = compute_actual_counts(df_gapped, freq=freq)
        expected_counts = compute_expected_counts(df_gapped, freq=freq)
        assert len(actual_counts) == len(expected_counts)


@pytest.mark.unittest
def test_compute_expected_counts_twotimesteps(df_meas_2010_2014):
    """
    Shows that compute_expected_counts() succeeds for a year with only three
    timesteps, yet fails for a year with only two timesteps: with two points
    the derived median frequency spans the whole gap, so the expected count
    for that year collapses to 2.
    """
    # slice the fixture so 2012 keeps either three or two timesteps
    head_three = df_meas_2010_2014.loc[:"2012-01-01 00:10:00 +01:00"]
    head_two = df_meas_2010_2014.loc[:"2012-01-01 00:00:00 +01:00"]
    tail = df_meas_2010_2014.loc["2012-12-31 23:50:00 +01:00":]

    df_success = pd.concat([head_three, tail], axis=0)
    df_fails = pd.concat([head_two, tail], axis=0)
    assert len(df_success.loc["2012"]) == 3
    assert len(df_fails.loc["2012"]) == 2

    # compute expected counts per year for both variants
    expected_peryear_success = compute_expected_counts(df_success, freq="Y")
    expected_peryear_fails = compute_expected_counts(df_fails, freq="Y")

    # 2012 is a leap year: 366 days * 144 ten-minute steps = 52704
    reference_success = np.array([52560.0, 52560.0, 52704.0, 52560.0, 52560.0])
    reference_fails = np.array([52560.0, 52560.0, 2.0, 52560.0, 52560.0])

    assert np.allclose(expected_peryear_success.values, reference_success)
    assert np.allclose(expected_peryear_fails.values, reference_fails)


@pytest.mark.unittest
def test_calc_wltidalindicators(df_ext_12_2010_2014):
ext_stats_notimezone = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014.tz_localize(None))
Expand Down

0 comments on commit 8bb04fa

Please sign in to comment.