Skip to content

Commit

Permalink
add fix for unequal lengths (#87)
Browse files Browse the repository at this point in the history
* add fix for unequal lengths

* added testcase

* updated whatsnew
  • Loading branch information
veenstrajelmer authored Jun 20, 2024
1 parent 53f9049 commit 8bb04fa
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

### Fix
- implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69)
- fixed different lengths of `compute_expected_counts()` and `compute_actual_counts()` in case of all-nan periods in [#87](https://github.com/Deltares-research/kenmerkendewaarden/pull/87)


## 0.1.0 (2024-03-11)
Expand Down
8 changes: 4 additions & 4 deletions kenmerkendewaarden/tidalindicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ def compute_actual_counts(df_meas, freq, column="values"):
"""
Compute the number of non-nan values in a column for all years/months in a timeseries index.
"""
df_meas_nonan = df_meas.loc[~df_meas[column].isnull()]
period_index = pd.PeriodIndex(df_meas_nonan.index, freq=freq)
df_actual_counts = df_meas_nonan.groupby(period_index)[column].count()
df_meas_isnotnull = ~df_meas[column].isnull()
period_index = pd.PeriodIndex(df_meas_isnotnull.index, freq=freq)
df_actual_counts = df_meas_isnotnull.groupby(period_index).sum()
return df_actual_counts


Expand All @@ -176,7 +176,7 @@ def compute_expected_counts(df_meas, freq):
Compute the expected number of values for all years/months in a timeseries index,
by taking the number of days for each year/month and dividing it by the median frequency in that period.
"""
# TODO: beware of series with e.g. only first and last value of month, this will result in freq=30days and then expected count of 1, it will pass even if there is almost no data
# TODO: beware of series with e.g. only first and last value of month/year, this will result in freq=30days and then expected count of 2, it will pass even if there is almost no data
df_meas = df_meas.copy()
df_meas["timediff"] = pd.TimedeltaIndex([pd.NaT]).append(df_meas.index[1:] - df_meas.index[:-1]) # TODO: from pandas>=2.1.4 the following also works: df_times.diff() (which results in a timedeltaindex of the correct length)
period_index = pd.PeriodIndex(df_meas.index, freq=freq)
Expand Down
50 changes: 50 additions & 0 deletions tests/test_tidalindicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import pytest
import kenmerkendewaarden as kw
import numpy as np
from kenmerkendewaarden.tidalindicators import compute_actual_counts, compute_expected_counts
import pandas as pd


@pytest.mark.unittest
Expand Down Expand Up @@ -59,6 +61,54 @@ def test_calc_wltidalindicators_mincount(df_meas_2010_2014):
assert slotgemiddelden_dict_withgap_lower_threshold["wl_mean_peryear"].isnull().sum() == 0


@pytest.mark.unittest
def test_compute_expected_actual_counts_samelenght(df_meas_2010_2014):
    """
    Regression test for the length mismatch caused by nan-dropping:
    compute_actual_counts() and compute_expected_counts() must return
    series of equal length even when an entire period contains only nans.
    https://github.com/Deltares-research/kenmerkendewaarden/issues/83
    """
    # work on a copy so the shared fixture is not mutated, then blank out 2012
    df_gapped = df_meas_2010_2014.copy()
    df_gapped.loc["2012", "values"] = np.nan
    df_gapped.loc["2012", "qualitycode"] = 99

    # actual and expected counts must align for both yearly and monthly grouping
    for freq in ["Y", "M"]:
        actual_counts = compute_actual_counts(df_gapped, freq=freq)
        expected_counts = compute_expected_counts(df_gapped, freq=freq)
        assert len(actual_counts) == len(expected_counts)


@pytest.mark.unittest
def test_compute_expected_counts_twotimesteps(df_meas_2010_2014):
    """
    Shows that compute_expected_counts() succeeds for a year with only three
    timesteps, yet fails for a year with only two timesteps: with two points
    the derived median frequency spans the whole gap, so the expected count
    for that year collapses to 2.
    """
    # slice the fixture so 2012 keeps either three or two timesteps
    head_three = df_meas_2010_2014.loc[:"2012-01-01 00:10:00 +01:00"]
    head_two = df_meas_2010_2014.loc[:"2012-01-01 00:00:00 +01:00"]
    tail = df_meas_2010_2014.loc["2012-12-31 23:50:00 +01:00":]

    df_success = pd.concat([head_three, tail], axis=0)
    df_fails = pd.concat([head_two, tail], axis=0)
    assert len(df_success.loc["2012"]) == 3
    assert len(df_fails.loc["2012"]) == 2

    # compute expected counts per year for both variants
    expected_peryear_success = compute_expected_counts(df_success, freq="Y")
    expected_peryear_fails = compute_expected_counts(df_fails, freq="Y")

    # 2012 is a leap year: 366 days * 144 ten-minute steps = 52704
    reference_success = np.array([52560.0, 52560.0, 52704.0, 52560.0, 52560.0])
    reference_fails = np.array([52560.0, 52560.0, 2.0, 52560.0, 52560.0])

    assert np.allclose(expected_peryear_success.values, reference_success)
    assert np.allclose(expected_peryear_fails.values, reference_fails)


@pytest.mark.unittest
def test_calc_wltidalindicators(df_ext_12_2010_2014):
ext_stats_notimezone = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014.tz_localize(None))
Expand Down

0 comments on commit 8bb04fa

Please sign in to comment.