Skip to content

Commit

Permalink
58 central check for timeseries completeness (#76)
Browse files Browse the repository at this point in the history
* implemented compute_expected_counts to flexibly filter out periods with too little coverage

* better plots and fixed testcases

* simplified calc_tidalindicators

* make all periodindex of tidalindicator dataframes contiguous

* require minimal coverage of timeseries in havengetallen instead of example script

* made compatible with pandas<2.1.4
  • Loading branch information
veenstrajelmer authored Jun 14, 2024
1 parent d7f8f5f commit caaf77c
Show file tree
Hide file tree
Showing 9 changed files with 175 additions and 88 deletions.
19 changes: 6 additions & 13 deletions examples/KWK_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,22 +79,15 @@
data_pd_HWLW_all_12 = hatyan.calc_HWLW12345to12(data_pd_HWLW_all) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
#crop timeseries to 10y
data_pd_HWLW_10y_12 = hatyan.crop_timeseries(data_pd_HWLW_all_12, times=slice(tstart_dt,tstop_dt),onlyfull=False)

#check if amount of HWs is enough
M2_period_timedelta = pd.Timedelta(hours=hatyan.schureman.get_schureman_freqs(['M2']).loc['M2','period [hr]'])
numHWs_expected = (tstop_dt-tstart_dt).total_seconds()/M2_period_timedelta.total_seconds()
numHWs = (data_pd_HWLW_10y_12['HWLWcode']==1).sum()
if numHWs < 0.95*numHWs_expected:
raise Exception(f'ERROR: not enough high waters present in period, {numHWs} instead of >=0.95*{int(numHWs_expected):d}')




#### TIDAL INDICATORS
if compute_indicators and data_pd_meas_all is not None and data_pd_HWLW_all is not None:
print(f'tidal indicators for {current_station}')
# compute and plot tidal indicators
dict_wltidalindicators = kw.calc_wltidalindicators(data_pd_meas_all)
dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(data_pd_HWLW_all_12)
dict_wltidalindicators = kw.calc_wltidalindicators(data_pd_meas_all, min_coverage=1)
dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(data_pd_HWLW_all_12, min_coverage=1)

# add hat/lat
df_meas_19y = data_pd_meas_all.loc["2001":"2019"]
Expand All @@ -118,11 +111,11 @@
# including years with too little values and years before physical break
slotgemiddelden_all = kw.calc_slotgemiddelden(df_meas=data_pd_meas_all.loc[:tstop_dt],
df_ext=data_pd_HWLW_all_12.loc[:tstop_dt],
only_valid=False, clip_physical_break=True)
min_coverage=0, clip_physical_break=True)
# only years with enough values and after potential physical break
slotgemiddelden_valid = kw.calc_slotgemiddelden(df_meas=data_pd_meas_all.loc[:tstop_dt],
df_ext=data_pd_HWLW_all_12.loc[:tstop_dt],
only_valid=True, clip_physical_break=True)
min_coverage=1, clip_physical_break=True)

# plot slotgemiddelden
fig1, ax1 = kw.plot_slotgemiddelden(slotgemiddelden_valid, slotgemiddelden_all)
Expand Down Expand Up @@ -181,7 +174,7 @@
if compute_gemgetij and data_pd_meas_all is not None and data_pd_HWLW_all is not None:

print(f'gem getijkrommen for {current_station}')
pred_freq = "10s" # frequency decides accuracy of tU/tD and other timings (and is writing freq of BOI timeseries)
pred_freq = "10s" # frequency influences the accuracy of havengetallen-scaling and is writing frequency of BOI timeseries

# derive getijkrommes: raw, scaled to havengetallen, scaled to havengetallen and 12h25min period
gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=data_pd_meas_10y, df_ext=None,
Expand Down
2 changes: 1 addition & 1 deletion kenmerkendewaarden/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def get_flat_meta_from_dataset(ds):
def get_stats_from_dataframe(df):
df_times = df.index
ts_dupltimes = df_times.duplicated()
ts_timediff = df_times[1:]-df_times[:-1] # from pandas 2.2.0 the following also works: df_times.diff()[1:]
ts_timediff = df_times[1:]-df_times[:-1] # TODO: from pandas 2.1.4 the following also works: df_times.diff()[1:]

ds_stats = {}
ds_stats['tstart'] = df_times.min()
Expand Down
2 changes: 1 addition & 1 deletion kenmerkendewaarden/gemiddeldgetij.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def calc_gemiddeldgetij(df_meas: pd.DataFrame, df_ext: pd.DataFrame = None,
if scale_extremes:
if df_ext is None:
raise ValueError("df_ext should be provided if scale_extremes=True")
df_havengetallen = calc_havengetallen(df_ext=df_ext)
df_havengetallen = calc_havengetallen(df_ext=df_ext, min_coverage=None) # TODO: also provide min_coverage here?
HW_sp, LW_sp = df_havengetallen.loc[0,['HW_values_median','LW_values_median']] # spring
HW_np, LW_np = df_havengetallen.loc[6,['HW_values_median','LW_values_median']] # neap
HW_av, LW_av = df_havengetallen.loc['mean',['HW_values_median','LW_values_median']] # mean
Expand Down
23 changes: 21 additions & 2 deletions kenmerkendewaarden/havengetallen.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,10 @@
import logging
from hatyan.astrog import astrog_culminations
from hatyan.timeseries import calc_HWLWnumbering
from kenmerkendewaarden.tidalindicators import calc_HWLWtidalrange
from kenmerkendewaarden.tidalindicators import (calc_HWLWtidalrange,
compute_actual_counts,
compute_expected_counts,
)
from kenmerkendewaarden.utils import raise_extremes_with_aggers

__all__ = ["calc_havengetallen",
Expand All @@ -21,7 +24,7 @@
logger = logging.getLogger(__name__)


def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False):
def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False, min_coverage=None):
"""
havengetallen consist of the extreme (high and low) median values and the
extreme median time delays with respect to the moonculmination.
Expand All @@ -47,6 +50,22 @@ def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False):
"""
raise_extremes_with_aggers(df_ext)

# check if coverage is high enough for havengetallen
if min_coverage is not None:
# TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year
# TODO: if we supply 4 years of complete data instead of 10 years, no error is raised
df_actual_counts_peryear = compute_actual_counts(df_ext, freq="Y")
df_expected_counts_peryear = compute_expected_counts(df_ext, freq="Y")
df_min_counts_peryear = df_expected_counts_peryear * min_coverage
bool_coverage_toolow = df_actual_counts_peryear < df_min_counts_peryear
df_debug = pd.DataFrame({'#required':df_min_counts_peryear,
'#actual':df_actual_counts_peryear,
'too little':bool_coverage_toolow,
})
if bool_coverage_toolow.any():
raise ValueError(f"coverage of some years is lower than "
f"min_coverage={min_coverage}:\n{df_debug}")

current_station = df_ext.attrs["station"]
logger.info(f'computing havengetallen for {current_station}')
# TODO: check culm_addtime and HWLWno+4 offsets. culm_addtime could also be 2 days or 2days +1h GMT-MET correction. 20 minutes seems odd since moonculm is about tidal wave from ocean
Expand Down
22 changes: 10 additions & 12 deletions kenmerkendewaarden/slotgemiddelden.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@


def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None,
only_valid: pd.Timestamp = False, clip_physical_break: bool = False,
min_coverage:float = None, clip_physical_break: bool = False,
with_nodal: bool = True):
"""
Compute slotgemiddelden from measurement timeseries and optionally also from extremes timeseries.
Expand All @@ -30,8 +30,8 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None,
the timeseries of measured waterlevels.
df_ext : pd.DataFrame, optional
the timeseries of extremes (high and low waters). The default is None.
only_valid : pd.Timestamp, optional
Whether to set yearly means to nans for years that do not have sufficient data coverage. The default is False.
min_coverage : float, optional
Set yearly means to nans for years that do not have sufficient data coverage. The default is None.
clip_physical_break : bool, optional
Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
with_nodal : bool, optional
Expand All @@ -44,13 +44,6 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None,
"""
# TODO: assert if station of both timeseries is the same (if df_ext is present)
# TODO: prevent hardcoded min_count argument for tidalindicators functions: https://github.com/Deltares-research/kenmerkendewaarden/issues/58
if only_valid:
min_count_ext = 1400 # 2*24*365/12.42=1410.6 (12.42 hourly extreme)
min_count_wl = 2900 # 24*365=8760 (hourly interval), 24/3*365=2920 (3-hourly interval)
else:
min_count_ext = None
min_count_wl = None

# initialize dict
slotgemiddelden_dict = {}
Expand All @@ -60,8 +53,11 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None,
df_meas = df_meas.iloc[:-1]

# calculate yearly means
dict_wltidalindicators = calc_wltidalindicators(df_meas, min_count=min_count_wl)
dict_wltidalindicators = calc_wltidalindicators(df_meas, min_coverage=min_coverage)
wl_mean_peryear = dict_wltidalindicators['wl_mean_peryear']
# convert periodindex to datetimeindex
# TODO: alternatively let fit_models support periodindex
wl_mean_peryear.index = wl_mean_peryear.index.to_timestamp()
slotgemiddelden_dict["wl_mean_peryear"] = wl_mean_peryear

# clip part of mean timeseries before physical break to supply to model
Expand All @@ -78,9 +74,11 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None,
df_ext = df_ext.iloc[:-1]

# calculate yearly means
dict_HWLWtidalindicators = calc_HWLWtidalindicators(df_ext, min_count=min_count_ext)
dict_HWLWtidalindicators = calc_HWLWtidalindicators(df_ext, min_coverage=min_coverage)
HW_mean_peryear = dict_HWLWtidalindicators['HW_mean_peryear']
LW_mean_peryear = dict_HWLWtidalindicators['LW_mean_peryear']
HW_mean_peryear.index = HW_mean_peryear.index.to_timestamp()
LW_mean_peryear.index = LW_mean_peryear.index.to_timestamp()
slotgemiddelden_dict["HW_mean_peryear"] = HW_mean_peryear
slotgemiddelden_dict["LW_mean_peryear"] = LW_mean_peryear

Expand Down
Loading

0 comments on commit caaf77c

Please sign in to comment.