From 36ed77a4ce7345da5f96c76d6c161d24b8e0c7f4 Mon Sep 17 00:00:00 2001 From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com> Date: Mon, 24 Jun 2024 14:46:34 +0200 Subject: [PATCH] 94 attach station attribute (#96) * pass on attrs and removed station argument from `plot_gemiddeldgetij()` * add comparison of station attrs in several functions * simplified plot_tidalindicators and add compare for station attrs * fixed logging --- examples/KWK_getcheckdata.py | 2 +- examples/KWK_process.py | 43 ++++++++++++++------------- kenmerkendewaarden/gemiddeldgetij.py | 32 ++++++++++++-------- kenmerkendewaarden/havengetallen.py | 2 +- kenmerkendewaarden/overschrijding.py | 2 +- kenmerkendewaarden/slotgemiddelden.py | 13 +++++--- kenmerkendewaarden/tidalindicators.py | 24 ++++++--------- tests/test_tidalindicators.py | 3 +- 8 files changed, 65 insertions(+), 56 deletions(-) diff --git a/examples/KWK_getcheckdata.py b/examples/KWK_getcheckdata.py index e74dc75..a109a45 100644 --- a/examples/KWK_getcheckdata.py +++ b/examples/KWK_getcheckdata.py @@ -9,9 +9,9 @@ # set logging level to INFO to get log messages import logging +logging.basicConfig() # calling basicConfig is essential to set logging level for sub-modules logging.getLogger("kenmerkendewaarden").setLevel(level="INFO") - # TODO: overview of data improvements: https://github.com/Deltares-research/kenmerkendewaarden/issues/29 # TODO: overview of data issues in https://github.com/Deltares-research/kenmerkendewaarden/issues/4 # TODO: missings/duplicates reported in https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39. Some of the duplicates are not retrieved since we use clean_df in ddlpy diff --git a/examples/KWK_process.py b/examples/KWK_process.py index 620698d..f829c95 100644 --- a/examples/KWK_process.py +++ b/examples/KWK_process.py @@ -10,6 +10,7 @@ # set logging level to INFO to get log messages import logging +logging.basicConfig() # calling basicConfig is essential to set logging level for sub-modules logging.getLogger("kenmerkendewaarden").setLevel(level="INFO") # TODO: HW/LW numbers not always increasing (at havengetallen): ['HANSWT','BROUWHVSGT08','PETTZD','DORDT'] @@ -48,9 +49,9 @@ 'MARLGT','NES','NIEUWSTZL','NORTHCMRT','DENOVBTN','OOSTSDE04','OOSTSDE11','OOSTSDE14','OUDSD','OVLVHWT','Q1', 'ROOMPBNN','ROOMPBTN','SCHAARVDND','SCHEVNGN','SCHIERMNOG','SINTANLHVSGR','STAVNSE','STELLDBTN','TERNZN','TERSLNZE','TEXNZE', 'VLAKTVDRN','VLIELHVN','VLISSGN','WALSODN','WESTKPLE','WESTTSLG','WIERMGDN','YERSKE'] +station_list = ["VLISSGN","HOEKVHLD","IJMDBTHVN","HARLGN","DENHDR","DELFZL","SCHIERMNOG","VLIELHVN","STELLDBTN","SCHEVNGN","ROOMPBTN"] # subset of 11 stations along the coast # TODO: maybe add from Dillingh 2013: DORDT, MAASMSMPL, PETTZD, ROTTDM -stat_list = ['HOEKVHLD']#,'HARVT10','VLISSGN'] - +station_list = ['HOEKVHLD']#,'HARVT10','VLISSGN'] nap_correction = False @@ -62,7 +63,7 @@ compute_overschrijding = True -for current_station in stat_list: +for current_station in station_list: print(f'starting process for {current_station}') plt.close('all') @@ -80,8 +81,9 @@ #crop timeseries to 10y data_pd_HWLW_10y_12 = hatyan.crop_timeseries(data_pd_HWLW_all_12, times=slice(tstart_dt,tstop_dt),onlyfull=False) - - + + + #### TIDAL INDICATORS if compute_indicators and data_pd_meas_all is not None and data_pd_HWLW_all is not None: print(f'tidal indicators for {current_station}') @@ -95,12 +97,16 @@ dict_HWLWtidalindicators["hat"] = hat dict_HWLWtidalindicators["lat"] = lat + # merge dictionaries + 
dict_wltidalindicators.update(dict_HWLWtidalindicators) + # plot - fig, ax = kw.plot_tidalindicators(dict_wltidalindicators, dict_HWLWtidalindicators) + fig, ax = kw.plot_tidalindicators(dict_wltidalindicators) fig.savefig(os.path.join(dir_indicators,f'tidal_indicators_{current_station}')) - - - + + + + #### SLOTGEMIDDELDEN # TODO: nodal cycle is not in same phase for all stations, this is not physically correct. # TODO: more data is needed for proper working of fitting for some stations (2011: BAALHK, BRESKVHVN, GATVBSLE, SCHAARVDND) @@ -153,11 +159,12 @@ df_havengetallen, data_pd_HWLW = kw.calc_havengetallen(df_ext=data_pd_HWLW_10y_12, return_df_ext=True) - print('HWLW FIGUREN PER TIJDSKLASSE, INCLUSIEF MEDIAN LINE') + print(f'havengetallen for {current_station}') + # plot hwlw per timeclass including median fig, axs = kw.plot_HWLW_pertimeclass(data_pd_HWLW, df_havengetallen) fig.savefig(os.path.join(dir_havget,f'HWLW_pertijdsklasse_inclmedianline_{current_station}')) - print('AARDAPPELGRAFIEK') + # plot aardappelgrafiek fig, (ax1,ax2) = kw.plot_aardappelgrafiek(df_havengetallen) fig.savefig(os.path.join(dir_havget, f'aardappelgrafiek_{year_slotgem}_{current_station}')) @@ -169,11 +176,10 @@ - ##### GEMIDDELDE GETIJKROMMEN if compute_gemgetij and data_pd_meas_all is not None and data_pd_HWLW_all is not None: - print(f'gem getijkrommen for {current_station}') + print(f'gemiddelde getijkrommen for {current_station}') pred_freq = "10s" # frequency influences the accuracy of havengetallen-scaling and is writing frequency of BOI timeseries # derive getijkrommes: raw, scaled to havengetallen, scaled to havengetallen and 12h25min period @@ -192,11 +198,11 @@ file_boi_csv = os.path.join(dir_gemgetij, f'Getijkromme_BOI_{key}_{current_station}_slotgem{year_slotgem}.csv') gemgetij_corr_boi[key].to_csv(file_boi_csv, float_format='%.3f') - fig_sum, ax_sum = kw.plot_gemiddeldgetij(gemgetij_dict=gemgetij_corr, gemgetij_dict_raw=gemgetij_raw, station=current_station, tick_hours=6) + fig_sum, ax_sum = kw.plot_gemiddeldgetij(gemgetij_dict=gemgetij_corr, gemgetij_dict_raw=gemgetij_raw, tick_hours=6) fig_sum.savefig(os.path.join(dir_gemgetij,f'gemgetij_trefHW_{current_station}')) - print(f'plot BOI figure and compare to KW2020: {current_station}') - fig_boi, ax1_boi = kw.plot_gemiddeldgetij(gemgetij_dict=gemgetij_corr_boi, station=current_station, tick_hours=12) + # plot BOI figure and compare to KW2020 + fig_boi, ax1_boi = kw.plot_gemiddeldgetij(gemgetij_dict=gemgetij_corr_boi, tick_hours=12) # plot validation lines if available # TODO: these index of this line is converted from datetimes to timedeltas to get it in the same plot @@ -225,7 +231,6 @@ - ###OVERSCHRIJDINGSFREQUENTIES # TODO: SLR trend correctie voor overschrijdingsfrequenties en evt ook voor andere KW? # TODO: resulting freqs seem to be shifted w.r.t. getijtafelboekje (mail PH 9-3-2022) @@ -246,7 +251,6 @@ if current_station =='HOEKVHLD': dir_vali_overschr = os.path.join(dir_base,'data_overschrijding') # TODO: this data is not reproducible yet stat_name = 'Hoek_van_Holland' - print('Load Hydra-NL distribution data and other validation data') dist_vali_exc = {} dist_vali_exc['Hydra-NL'] = pd.read_csv(os.path.join(dir_vali_overschr,'Processed_HydraNL','Without_model_uncertainty',f'{stat_name}.csv'), sep=';', header=[0]) dist_vali_exc['Hydra-NL']['values'] /= 100 # cm to m @@ -262,7 +266,6 @@ dist_vali_dec['validation']['values'] /= 100 # 1. 
Exceedance - print('Exceedance') dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, clip_physical_break=True, dist=dist_vali_exc, interp_freqs=Tfreqs_interested) @@ -274,7 +277,6 @@ fig.savefig(os.path.join(dir_overschrijding, f'Exceedance_lines_{current_station}.png')) # 2. Deceedance - print('Deceedance') dist_dec = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, clip_physical_break=True, dist=dist_vali_dec, inverse=True, interp_freqs=Tfreqs_interested) @@ -283,4 +285,3 @@ fig, ax = kw.plot_overschrijding(dist_dec) fig.savefig(os.path.join(dir_overschrijding, f'Deceedance_lines_{current_station}.png')) - diff --git a/kenmerkendewaarden/gemiddeldgetij.py b/kenmerkendewaarden/gemiddeldgetij.py index a5da5b9..127453c 100644 --- a/kenmerkendewaarden/gemiddeldgetij.py +++ b/kenmerkendewaarden/gemiddeldgetij.py @@ -76,6 +76,10 @@ def calc_gemiddeldgetij(df_meas: pd.DataFrame, df_ext: pd.DataFrame = None, min_ if scale_extremes: if df_ext is None: raise ValueError("df_ext should be provided if scale_extremes=True") + # compare station attributes + station_attrs = [df.attrs["station"] for df in [df_meas, df_ext]] + assert all(x == station_attrs[0] for x in station_attrs) + df_havengetallen = calc_havengetallen(df_ext=df_ext, min_coverage=min_coverage) HW_sp, LW_sp = df_havengetallen.loc[0,['HW_values_median','LW_values_median']] # spring HW_np, LW_np = df_havengetallen.loc[6,['HW_values_median','LW_values_median']] # neap @@ -90,14 +94,14 @@ def calc_gemiddeldgetij(df_meas: pd.DataFrame, df_ext: pd.DataFrame = None, min_ times_pred_1mnth = pd.date_range(start=pd.Timestamp(tstop_dt.year,1,1,0,0)-pd.Timedelta(hours=12), end=pd.Timestamp(tstop_dt.year,2,1,0,0), freq=freq) #start 12 hours in advance, to assure also corrected values on desired tstart comp_av.attrs['nodalfactors'] = False #nodalfactors=False to guarantee repetative signal - prediction_av = hatyan.prediction(comp_av, times=times_pred_1mnth) - prediction_av_ext = hatyan.calc_HWLW(ts=prediction_av, calc_HWLW345=False) + prediction_avg = hatyan.prediction(comp_av, times=times_pred_1mnth) + prediction_avg_ext = hatyan.calc_HWLW(ts=prediction_avg, calc_HWLW345=False) - time_firstHW = prediction_av_ext.loc[prediction_av_ext['HWLWcode']==1].index[0] #time of first HW - ia1 = prediction_av_ext.loc[time_firstHW:].index[0] #time of first HW - ia2 = prediction_av_ext.loc[time_firstHW:].index[2] #time of second HW - prediction_av_one = prediction_av.loc[ia1:ia2] - prediction_av_ext_one = prediction_av_ext.loc[ia1:ia2] + time_firstHW = prediction_avg_ext.loc[prediction_avg_ext['HWLWcode']==1].index[0] #time of first HW + ia1 = prediction_avg_ext.loc[time_firstHW:].index[0] #time of first HW + ia2 = prediction_avg_ext.loc[time_firstHW:].index[2] #time of second HW + prediction_avg_one = prediction_avg.loc[ia1:ia2] + prediction_avg_ext_one = prediction_avg_ext.loc[ia1:ia2] # ============================================================================= # Hatyan predictie voor 1 jaar met gemiddelde helling maansbaan (voor afleiden spring-doodtijcyclus) >> predictie zonder nodalfactors instead @@ -159,7 +163,7 @@ def calc_gemiddeldgetij(df_meas: pd.DataFrame, df_ext: pd.DataFrame = None, min_ #timeseries for gele boekje (av/sp/np have different lengths, time is relative to HW of av and HW of sp/np are shifted there) logger.info(f'reshape_signal GEMGETIJ: {current_station}') - prediction_av_corr_one = reshape_signal(prediction_av_one, prediction_av_ext_one, HW_goal=HW_av, 
LW_goal=LW_av, tP_goal=tP_goal) + prediction_av_corr_one = reshape_signal(prediction_avg_one, prediction_avg_ext_one, HW_goal=HW_av, LW_goal=LW_av, tP_goal=tP_goal) prediction_av_corr_one.index = prediction_av_corr_one.index - prediction_av_corr_one.index[0] # make relative to first timestamp (=HW) if scale_period: # resampling required because of scaling prediction_av_corr_one = prediction_av_corr_one.resample(freq).nearest() @@ -188,7 +192,7 @@ def calc_gemiddeldgetij(df_meas: pd.DataFrame, df_ext: pd.DataFrame = None, min_ return gemgetij_dict -def plot_gemiddeldgetij(gemgetij_dict:dict, gemgetij_dict_raw:dict = None, station:str = None, tick_hours:int = None): +def plot_gemiddeldgetij(gemgetij_dict:dict, gemgetij_dict_raw:dict = None, tick_hours:int = None): """ Default plotting function for gemiddeldgetij dictionaries. @@ -198,8 +202,6 @@ def plot_gemiddeldgetij(gemgetij_dict:dict, gemgetij_dict_raw:dict = None, stati dictionary as returned from `kw.calc_gemiddeldgetij()`. gemgetij_raw_dict : dict, optional dictionary as returned from `kw.calc_gemiddeldgetij()` e.g. with uncorrected values. The default is None. - station : str, optional - station name, used in figure title. The default is None. ticks_12h : bool, optional whether to use xaxis ticks of 12 hours, otherwise automatic but less nice values @@ -211,7 +213,11 @@ def plot_gemiddeldgetij(gemgetij_dict:dict, gemgetij_dict_raw:dict = None, stati Figure axis handle. """ - # TODO: prevent station argument + # get and compare station attributes + station_attrs = [v.attrs["station"] for k,v in gemgetij_dict.items()] + assert all(x == station_attrs[0] for x in station_attrs) + station = station_attrs[0] + logger.info(f'plot getijkromme trefHW: {station}') fig, ax = plt.subplots(figsize=(14,7)) cmap = plt.get_cmap("tab10") @@ -376,4 +382,6 @@ def repeat_signal(ts_one_HWtoHW, nb, nf): index=ts_one_HWtoHW.index + iAdd*tidalperiod) ts_rep = pd.concat([ts_rep,ts_add]) ts_rep = ts_rep.loc[~ts_rep.index.duplicated()] + # pass on attributes + ts_rep.attrs = ts_one_HWtoHW.attrs return ts_rep diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py index 852cd6c..f3e492b 100644 --- a/kenmerkendewaarden/havengetallen.py +++ b/kenmerkendewaarden/havengetallen.py @@ -128,7 +128,7 @@ def calc_HWLW_moonculm_combi(data_pd_HWLW_12,culm_addtime=None): def calc_HWLW_culmhr_summary(data_pd_HWLW): - logger.info('calculate medians per hour group for LW and HW (instead of 1991 method: average of subgroups with removal of outliers)') + logger.info('calculate medians per hour group for LW and HW') data_pd_HW = data_pd_HWLW.loc[data_pd_HWLW['HWLWcode']==1] data_pd_LW = data_pd_HWLW.loc[data_pd_HWLW['HWLWcode']==2] diff --git a/kenmerkendewaarden/overschrijding.py b/kenmerkendewaarden/overschrijding.py index e55e0a9..f9b5dbb 100644 --- a/kenmerkendewaarden/overschrijding.py +++ b/kenmerkendewaarden/overschrijding.py @@ -75,7 +75,7 @@ def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None, if dist is None: dist = {} - logger.info('Calculate unfiltered distribution') + logger.info(f'Calculate unfiltered distribution (inverse={inverse})') dist['Ongefilterd'] = distribution(df_extrema_clean, inverse=inverse) # TODO: re-enable filter for river discharge peaks diff --git a/kenmerkendewaarden/slotgemiddelden.py b/kenmerkendewaarden/slotgemiddelden.py index e4e8f93..f95d6e3 100644 --- a/kenmerkendewaarden/slotgemiddelden.py +++ b/kenmerkendewaarden/slotgemiddelden.py @@ -43,8 +43,6 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, 
df_ext: pd.DataFrame=None, dictionary with yearly means and model fits, optionally also for extremes. """ - # TODO: assert if station of both timeseries is the same (if df_ext is present) - # initialize dict slotgemiddelden_dict = {} @@ -69,6 +67,10 @@ def calc_slotgemiddelden(df_meas: pd.DataFrame, df_ext: pd.DataFrame=None, slotgemiddelden_dict["wl_model_fit"] = pred_pd_wl if df_ext is not None: + # compare station attributes + station_attrs = [df.attrs["station"] for df in [df_meas, df_ext]] + assert all(x == station_attrs[0] for x in station_attrs) + # clip last value of the timeseries if this is exactly newyearsday if df_ext.index[-1] == pd.Timestamp(df_ext.index[-1].year,1,1, tz=df_ext.index.tz): df_ext = df_ext.iloc[:-1] @@ -106,7 +108,7 @@ def plot_slotgemiddelden(slotgemiddelden_dict:dict, slotgemiddelden_dict_all:dic Output from `kw.calc_slotgemiddelden` containing timeseries of yearly mean waterlevels and corresponding model fits. slotgemiddelden_dict_all : dict, optional - Optionally provide another dictionary with unfiltered mean waterlevls. + Optionally provide another dictionary with unfiltered mean waterlevels. Only used to plot the mean waterlevels (in grey). The default is None. Returns @@ -117,7 +119,6 @@ def plot_slotgemiddelden(slotgemiddelden_dict:dict, slotgemiddelden_dict_all:dic Figure axis handle. """ - # TODO: maybe add an escape for if the station attr is not present station = slotgemiddelden_dict['wl_mean_peryear'].attrs["station"] fig, ax = plt.subplots(figsize=(12,6)) @@ -136,6 +137,10 @@ def plot_slotgemiddelden(slotgemiddelden_dict:dict, slotgemiddelden_dict_all:dic # plot timeseries of average extremes if slotgemiddelden_dict_all is not None: + # compare station attributes + station_attrs = [dic['wl_mean_peryear'].attrs["station"] for dic in [slotgemiddelden_dict, slotgemiddelden_dict_all]] + assert all(x == station_attrs[0] for x in station_attrs) + if "HW_mean_peryear" in slotgemiddelden_dict_all.keys(): HW_mean_peryear_all = slotgemiddelden_dict_all["HW_mean_peryear"] LW_mean_peryear_all = slotgemiddelden_dict_all["LW_mean_peryear"] diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py index 9dbce88..e1373f2 100644 --- a/kenmerkendewaarden/tidalindicators.py +++ b/kenmerkendewaarden/tidalindicators.py @@ -227,16 +227,14 @@ def plot_pd_series(indicators_dict, ax): ax.hlines(value, xmin, xmax, linestyle="--", color="k", label=key, zorder=1) -def plot_tidalindicators(indicators_wl:dict = None, indicators_ext = None): +def plot_tidalindicators(dict_indicators:dict): """ plot tidalindicators Parameters ---------- - indicators_wl : dict, optional - Dictionary as returned from `kw.calc_wltidalindicators()`. The default is None. - indicators_ext : TYPE, optional - Dictionary as returned from `kw.calc_HWLWtidalindicators()`. The default is None. + dict_indicators : dict, optional + Dictionary as returned from `kw.calc_wltidalindicators()` and/or `kw.calc_HWLWtidalindicators()`. The default is None. 
Returns ------- @@ -247,17 +245,13 @@ def plot_tidalindicators(indicators_wl:dict = None, indicators_ext = None): """ + # get and compare station attributes + station_attrs = [v.attrs["station"] for k,v in dict_indicators.items() if hasattr(v, "attrs")] + assert all(x == station_attrs[0] for x in station_attrs) + station = station_attrs[0] + fig, ax = plt.subplots(figsize=(12,6)) - - if indicators_wl is not None: - # TODO: maybe add an escape for if the station attr is not present - station = indicators_wl['wl_mean_peryear'].attrs["station"] - plot_pd_series(indicators_wl, ax) - if indicators_ext is not None: - # TODO: maybe add an escape for if the station attr is not present - station = indicators_ext['HW_mean_peryear'].attrs["station"] - plot_pd_series(indicators_ext, ax) - + plot_pd_series(dict_indicators, ax) ax.grid() ax.legend(loc=1) ax.set_ylabel("water level [m]") diff --git a/tests/test_tidalindicators.py b/tests/test_tidalindicators.py index 5eb5af4..42e2ed9 100644 --- a/tests/test_tidalindicators.py +++ b/tests/test_tidalindicators.py @@ -161,7 +161,8 @@ def test_calc_wltidalindicators(df_ext_12_2010_2014): def test_plot_wltidalindicators(df_meas_2010_2014, df_ext_12_2010_2014): wl_stats = kw.calc_wltidalindicators(df_meas_2010_2014) ext_stats = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014) - kw.plot_tidalindicators(wl_stats, ext_stats) + wl_stats.update(ext_stats) + kw.plot_tidalindicators(wl_stats) @pytest.mark.unittest
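
The station-attribute checks added throughout this patch all follow the same pattern: collect the "station" entry from each input's pandas attrs and assert they are identical before combining data, then use that single value for plot titles instead of a separate station argument. Below is a minimal standalone sketch of that pattern, assuming pandas' DataFrame.attrs (available since pandas 1.0); the helper name assert_same_station is illustrative only and is not part of the kenmerkendewaarden package or of this patch.

    import pandas as pd

    def assert_same_station(*datasets):
        # collect the station attribute of every provided DataFrame
        stations = [ds.attrs["station"] for ds in datasets]
        # all inputs must originate from the same measurement station
        assert all(s == stations[0] for s in stations), f"station attrs differ: {stations}"
        return stations[0]

    # example usage with two toy DataFrames carrying a station attribute
    df_meas = pd.DataFrame({"values": [0.1, -0.2]})
    df_meas.attrs["station"] = "HOEKVHLD"
    df_ext = pd.DataFrame({"values": [1.1, -0.9]})
    df_ext.attrs["station"] = "HOEKVHLD"
    station = assert_same_station(df_meas, df_ext)  # returns "HOEKVHLD"

Note that pandas does not propagate attrs through every operation (e.g. pd.concat may drop them, which is why repeat_signal in this patch copies attrs explicitly), so functions that build new objects from inputs need to reattach the attribute themselves.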