Commit

added .dt accessor to fix the rounding issue (#69)
* added the .dt accessor to all datetime/timedelta rounding calls to fix the rounding issue

* added additional tests

* removed the pandas<2.2.0 version constraint

* updated whatsnew

* changed rounding and frequency aliases from 'S' to 's' and from 'y' to 'Y' to fix FutureWarnings
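
The heart of the change, sketched below as a minimal, self-contained example with invented data (nothing here is taken from the repository): datetime and timedelta Series are rounded through the documented `.dt` accessor, using the lowercase `"s"` alias so no deprecation warning is raised on newer pandas.

```python
# Minimal sketch of the rounding pattern this commit switches to (illustrative data only).
import pandas as pd

times = pd.Series(pd.to_datetime(["2010-01-01 00:29:31", "2010-01-01 12:44:59"]))
delays = pd.Series(pd.to_timedelta(["01:34:56.789", "02:10:00.123"]))

# Round datetime/timedelta values through the .dt accessor, with lowercase aliases.
print(times.dt.round("h"))   # nearest hour
print(delays.dt.round("s"))  # whole seconds
```
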
veenstrajelmer authored Jun 14, 2024
1 parent 6105b96 commit d6f888a
Showing 5 changed files with 36 additions and 12 deletions.
3 changes: 3 additions & 0 deletions docs/whats-new.md
@@ -17,6 +17,9 @@
- clipping of timeseries on physical breaks with `kw.data_retrieve.clip_timeseries_physical_break()` (private) in [#61](https://github.com/Deltares-research/kenmerkendewaarden/pull/61) and [#64](https://github.com/Deltares-research/kenmerkendewaarden/pull/64)
- added dedicated plotting functions in [#64](https://github.com/Deltares-research/kenmerkendewaarden/pull/64), [#66](https://github.com/Deltares-research/kenmerkendewaarden/pull/66) and [#68](https://github.com/Deltares-research/kenmerkendewaarden/pull/68)

+ ### Fix
+ - implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69)


## 0.1.0 (2024-03-11)
This is the set of kenmerkende waarden kust scripts and functions as transferred from hatyan and how they were applied in the kwk-2022 project.
4 changes: 2 additions & 2 deletions kenmerkendewaarden/data_analysis.py
@@ -71,8 +71,8 @@ def plot_measurements(df, df_ext=None):
data_pd_LW = df_ext.loc[df_ext['HWLWcode'].isin([2,3,5])]

# calculate monthly/yearly mean for meas ext data
- HW_mean_peryear_long = data_pd_HW.groupby(pd.PeriodIndex(data_pd_HW.index, freq="y"))['values'].mean()
- LW_mean_peryear_long = data_pd_LW.groupby(pd.PeriodIndex(data_pd_LW.index, freq="y"))['values'].mean()
+ HW_mean_peryear_long = data_pd_HW.groupby(pd.PeriodIndex(data_pd_HW.index, freq="Y"))['values'].mean()
+ LW_mean_peryear_long = data_pd_LW.groupby(pd.PeriodIndex(data_pd_LW.index, freq="Y"))['values'].mean()

ax1.plot(HW_mean_peryear_long,'m',linewidth=0.7, label=None)
ax1.plot(LW_mean_peryear_long,'m',linewidth=0.7, label=None)
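
For reference, a hedged sketch of the yearly-mean grouping shown above with the capital `"Y"` alias; the daily timeseries here is synthetic and only illustrates the pattern.

```python
# Illustrative sketch: group a timeseries by calendar year via PeriodIndex(freq="Y").
import numpy as np
import pandas as pd

idx = pd.date_range("2009-01-01", "2011-12-31", freq="D")
df = pd.DataFrame({"values": np.random.default_rng(0).normal(size=len(idx))}, index=idx)

# The lowercase "y" alias is what triggered the FutureWarnings this commit resolves.
mean_peryear = df.groupby(pd.PeriodIndex(df.index, freq="Y"))["values"].mean()
print(mean_peryear)
```
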
16 changes: 9 additions & 7 deletions kenmerkendewaarden/havengetallen.py
@@ -95,7 +95,7 @@ def calc_HWLW_moonculm_combi(data_pd_HWLW_12,culm_addtime=None):
data_pd_HWLW_idxHWLWno.loc[HW_bool,'getijperiod'] = data_pd_HWLW_idxHWLWno.loc[HW_bool,'times'].iloc[1:].values - data_pd_HWLW_idxHWLWno.loc[HW_bool,'times'].iloc[:-1] #this works properly since index is HWLW
data_pd_HWLW_idxHWLWno.loc[HW_bool,'duurdaling'] = data_pd_HWLW_idxHWLWno.loc[~HW_bool,'times'] - data_pd_HWLW_idxHWLWno.loc[HW_bool,'times']
data_pd_HWLW_idxHWLWno['culm_time'] = moonculm_idxHWLWno['datetime'] #couple HWLW to moonculminations two days earlier (this works since index is HWLWno)
- data_pd_HWLW_idxHWLWno['culm_hr'] = (data_pd_HWLW_idxHWLWno['culm_time'].round('h').dt.hour)%12
+ data_pd_HWLW_idxHWLWno['culm_hr'] = (data_pd_HWLW_idxHWLWno['culm_time'].dt.round('h').dt.hour)%12
data_pd_HWLW_idxHWLWno['HWLW_delay'] = data_pd_HWLW_idxHWLWno['times'] - data_pd_HWLW_idxHWLWno['culm_time']
if culm_addtime is not None:
data_pd_HWLW_idxHWLWno['HWLW_delay'] -= culm_addtime
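
A hedged sketch of the culmination-hour computation changed above: with the `.dt` accessor the rounding applies to the Series values before the hour is taken modulo 12 (the timestamps below are invented).

```python
# Illustrative only: round culmination times to the nearest hour, then take hour % 12.
import pandas as pd

culm_time = pd.Series(pd.to_datetime(["2010-01-01 13:40:00", "2010-01-02 02:12:00"]))
culm_hr = culm_time.dt.round("h").dt.hour % 12
print(culm_hr.tolist())  # [2, 2]
```
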
@@ -110,17 +110,19 @@ def calc_HWLW_culmhr_summary(data_pd_HWLW):

HWLW_culmhr_summary = pd.DataFrame()
HWLW_culmhr_summary['HW_values_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['values'].median()
- HWLW_culmhr_summary['HW_delay_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['HWLW_delay'].median().round('S')
+ HWLW_culmhr_summary['HW_delay_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['HWLW_delay'].median()
HWLW_culmhr_summary['LW_values_median'] = data_pd_LW.groupby(data_pd_LW['culm_hr'])['values'].median()
- HWLW_culmhr_summary['LW_delay_median'] = data_pd_LW.groupby(data_pd_LW['culm_hr'])['HWLW_delay'].median().round('S')
+ HWLW_culmhr_summary['LW_delay_median'] = data_pd_LW.groupby(data_pd_LW['culm_hr'])['HWLW_delay'].median()
HWLW_culmhr_summary['tijverschil'] = HWLW_culmhr_summary['HW_values_median'] - HWLW_culmhr_summary['LW_values_median']
- HWLW_culmhr_summary['getijperiod_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['getijperiod'].median().round('S')
- HWLW_culmhr_summary['duurdaling_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['duurdaling'].median().round('S')
+ HWLW_culmhr_summary['getijperiod_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['getijperiod'].median()
+ HWLW_culmhr_summary['duurdaling_median'] = data_pd_HW.groupby(data_pd_HW['culm_hr'])['duurdaling'].median()

HWLW_culmhr_summary.loc['mean',:] = HWLW_culmhr_summary.mean() #add mean row to dataframe (not convenient to add immediately due to plotting with index 0-11)
- for colname in HWLW_culmhr_summary.columns: #round timedelta to make outputformat nicer
+
+ # round all timedeltas to seconds to make outputformat nicer
+ for colname in HWLW_culmhr_summary.columns:
if HWLW_culmhr_summary[colname].dtype == 'timedelta64[ns]':
- HWLW_culmhr_summary[colname] = HWLW_culmhr_summary[colname].round('S')
+ HWLW_culmhr_summary[colname] = HWLW_culmhr_summary[colname].dt.round('s')

return HWLW_culmhr_summary
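
The loop added above rounds every timedelta column once, after the mean row is appended. A minimal sketch of that pattern on a hypothetical one-column summary frame:

```python
# Hedged sketch: round all timedelta columns of a summary frame to whole seconds.
import pandas as pd

delays = pd.Series(pd.to_timedelta(["01:34:56.789", "02:10:00.123", "01:59:59.500"]))
summary = pd.DataFrame({"HW_delay_median": [delays.median()]})

for colname in summary.columns:
    if summary[colname].dtype == "timedelta64[ns]":
        # .dt.round("s") with the lowercase alias avoids the FutureWarning raised
        # for the deprecated "S" alias on recent pandas versions.
        summary[colname] = summary[colname].dt.round("s")

print(summary)
```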

2 changes: 0 additions & 2 deletions pyproject.toml
@@ -16,8 +16,6 @@ dependencies = [
"numpy>=1.22.0",
#pandas<2.0.0 supports non-nanosecond timestep and therefore larger min/max range
"pandas>=2.0.0",
- #pandas>=2.2.0 results in different values for havengetallen: https://github.com/Deltares-research/kenmerkendewaarden/issues/53
- "pandas<2.2.0",
#matplotlib<3.5.2 raises "ValueError: Multi-dimensional indexing" in hatyan.plot_timeseries()
"matplotlib>=3.5.2",
#netcdf4<1.5.4 pip install fails in py39
23 changes: 22 additions & 1 deletion tests/test_havengetallen.py
@@ -8,17 +8,38 @@
@pytest.mark.unittest
def test_havengetallen(df_ext_12_2010):
df_havengetallen, data_pd_hwlw = kw.calc_havengetallen(df_ext=df_ext_12_2010, return_df_ext=True)

# check if all expected columns are present
df_columns = ['HW_values_median', 'HW_delay_median', 'LW_values_median',
'LW_delay_median', 'tijverschil', 'getijperiod_median',
'duurdaling_median']
assert set(df_havengetallen.columns) == set(df_columns)

# check if mean row is present
assert len(df_havengetallen.index) == 13
assert "mean" in df_havengetallen.index

# check if extremes dataframe length has not changed
assert len(data_pd_hwlw) == len(df_ext_12_2010)

# assert the havengetallen values
hw_values_median = df_havengetallen["HW_values_median"].values
hw_values_median_expected = np.array([1.345, 1.31 , 1.225, 1.17 , 1.04 , 0.925, 0.865, 0.9 , 1.045,
1.135, 1.25 , 1.35 , 1.13 ])
assert np.allclose(hw_values_median, hw_values_median_expected)
assert len(data_pd_hwlw) == len(df_ext_12_2010)

# test time delays
hw_delay_median = df_havengetallen["HW_delay_median"].values.astype(float)
hw_delay_median_expected = np.array([5697000000000, 4763000000000, 3792000000000, 3230000000000,
2985000000000, 3729000000000, 5722000000000, 7830000000000,
8335000000000, 7995000000000, 7501000000000, 6628000000000,
5684000000000]) #nanoseconds representation
assert np.allclose(hw_delay_median, hw_delay_median_expected)

# test time rounding to seconds
for colname in df_havengetallen.columns: #round timedelta to make outputformat nicer
if df_havengetallen[colname].dtype == 'timedelta64[ns]':
assert(df_havengetallen[colname].dt.nanoseconds == 0).all()
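
The expected delay medians in the new test are raw nanosecond integers, because `.values.astype(float)` on a `timedelta64[ns]` column yields nanoseconds. Purely as an illustration (not part of the test), the first few values can be converted back to readable timedeltas:

```python
# Illustration only: convert the nanosecond expectations back to timedeltas.
import numpy as np
import pandas as pd

hw_delay_median_expected = np.array([5697000000000, 4763000000000, 3792000000000])
print(pd.to_timedelta(hw_delay_median_expected))  # e.g. 5697000000000 ns == 0 days 01:34:57
```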


