diff --git a/docs/whats-new.md b/docs/whats-new.md index 8c88e97..422ab8e 100644 --- a/docs/whats-new.md +++ b/docs/whats-new.md @@ -4,6 +4,7 @@ ### Feat - expanded physical_break_dict in [#151](https://github.com/Deltares-research/kenmerkendewaarden/pull/151) +- linear fit for slotgemiddelden (no nodal) in [#157](https://github.com/Deltares-research/kenmerkendewaarden/pull/157) ## 0.3.0 (2024-10-01) diff --git a/kenmerkendewaarden/slotgemiddelden.py b/kenmerkendewaarden/slotgemiddelden.py index 5ea5680..b15de8c 100644 --- a/kenmerkendewaarden/slotgemiddelden.py +++ b/kenmerkendewaarden/slotgemiddelden.py @@ -27,10 +27,13 @@ def calc_slotgemiddelden( df_ext: pd.DataFrame = None, min_coverage: float = None, clip_physical_break: bool = False, - with_nodal: bool = True, ): """ Compute slotgemiddelden from measurement timeseries and optionally also from extremes timeseries. + A simple linear trend is used to avoid all pretend-accuracy. However, when fitting a + linear trend on a limited amount of data, the nodal cycle and wind effects will cause + the model fit to be inaccurate. It is wise to use at least 30 years of data for + a valid fit, this is >1.5 times the nodal cycle. Parameters ---------- @@ -42,8 +45,6 @@ def calc_slotgemiddelden( Set yearly means to nans for years that do not have sufficient data coverage. The default is None. clip_physical_break : bool, optional Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False. - with_nodal : bool, optional - Whether to include a nodal cycle in the linear trend model. The default is True. Returns ------- @@ -71,7 +72,7 @@ def calc_slotgemiddelden( wl_mean_peryear = clip_timeseries_physical_break(wl_mean_peryear) # fit linear models over yearly mean values - pred_pd_wl = fit_models(wl_mean_peryear, with_nodal=with_nodal) + pred_pd_wl = predict_linear_model(wl_mean_peryear) slotgemiddelden_dict["wl_model_fit"] = pred_pd_wl if df_ext is not None: @@ -103,9 +104,9 @@ def calc_slotgemiddelden( tidalrange_mean_peryear = clip_timeseries_physical_break(tidalrange_mean_peryear) # fit linear models over yearly mean values - pred_pd_HW = fit_models(HW_mean_peryear, with_nodal=with_nodal) - pred_pd_LW = fit_models(LW_mean_peryear, with_nodal=with_nodal) - pred_pd_tidalrange = fit_models(tidalrange_mean_peryear, with_nodal=with_nodal) + pred_pd_HW = predict_linear_model(HW_mean_peryear) + pred_pd_LW = predict_linear_model(LW_mean_peryear) + pred_pd_tidalrange = predict_linear_model(tidalrange_mean_peryear) slotgemiddelden_dict["HW_model_fit"] = pred_pd_HW slotgemiddelden_dict["LW_model_fit"] = pred_pd_LW slotgemiddelden_dict["tidalrange_model_fit"] = pred_pd_tidalrange @@ -210,13 +211,13 @@ def plot_slotgemiddelden( return fig, ax -def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame: +def predict_linear_model(ser: pd.Series, with_nodal=False) -> pd.DataFrame: """ Fit linear model over yearly means in mean_array_todate, including five years in the future. Parameters ---------- - mean_array_todate : pd.Series + ser : pd.Series DESCRIPTION. Returns @@ -226,61 +227,52 @@ def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame: """ - start = mean_array_todate.index.min() - end = mean_array_todate.index.max() + 1 + start = ser.index.min() + end = ser.index.max() + 1 - logger.info(f"fit linear model (with_nodal={with_nodal}) for {start} to {end}") + logger.info(f"fit linear model for {start} to {end}") - # We'll just use the years. This assumes that annual waterlevels are used that are stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1. This is not logical, but common practice. + # generate contiguous timeseries including gaps and including slotgemiddelde year allyears_dt = pd.period_range(start=start, end=end) - mean_array_allyears = pd.Series(mean_array_todate, index=allyears_dt) + ser_allyears = pd.Series(ser, index=allyears_dt) - mean_array_allyears_nonans = mean_array_allyears.loc[~mean_array_allyears.isnull()] - if len(mean_array_allyears_nonans) < 2: + ser_nonans = ser_allyears.loc[~ser_allyears.isnull()] + if len(ser_nonans) < 2: raise ValueError( - f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{mean_array_allyears_nonans}" + f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{ser_nonans}" ) - # convert to dataframe of expected format - # TODO: make functions accept mean_array instead of df as argument? - df = pd.DataFrame( - {"year": mean_array_allyears.index.year, "height": mean_array_allyears.values} - ) - - # below methods are copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py - # TODO: install slr package as dependency or keep separate? - fit, _, X = linear_model(df, with_wind=False, with_nodal=with_nodal) - pred_linear = fit.predict(X) + # get model fit + fit, _, X = fit_linear_model(ser_allyears, with_nodal=with_nodal) - linear_fit = pd.Series(pred_linear, index=allyears_dt, name="values") - linear_fit.index.name = mean_array_todate.index.name - return linear_fit + # predict + pred_arr = fit.predict(X) + pred_pd = pd.Series(pred_arr, index=allyears_dt, name="values") + pred_pd.index.name = ser.index.name + return pred_pd -# copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py -def linear_model(df, with_wind=True, with_ar=True, with_nodal=True, quantity="height"): - """Define the linear model with optional wind and autoregression. - See the latest report for a detailed description. +def fit_linear_model(df, with_nodal=False): + """ + Define the linear model with constant and trend, optionally with nodal. + simplified from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py """ - y = df[quantity] - X = np.c_[df["year"] - 1970,] - # month = np.mod(df['year'], 1) * 12.0 + # just use the years. This assumes that annual waterlevels are used that are + # stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1 + # This is not logical, but common practice. + years = df.index.year + y = df.values + X = np.c_[years - 1970,] names = ["Constant", "Trend"] if with_nodal: X = np.c_[ X, - np.cos(2 * np.pi * (df["year"] - 1970) / 18.613), - np.sin(2 * np.pi * (df["year"] - 1970) / 18.613), + np.cos(2 * np.pi * (years - 1970) / 18.613), + np.sin(2 * np.pi * (years - 1970) / 18.613), ] names.extend(["Nodal U", "Nodal V"]) - if with_wind: - X = np.c_[X, df["u2"], df["v2"]] - names.extend(["Wind $u^2$", "Wind $v^2$"]) X = sm.add_constant(X) - if with_ar: - model = sm.GLSAR(y, X, missing="drop", rho=1) - else: - model = sm.OLS(y, X, missing="drop") + model = sm.OLS(y, X, missing="drop") fit = model.fit(cov_type="HC0") return fit, names, X diff --git a/tests/test_slotgemiddelden.py b/tests/test_slotgemiddelden.py index d0b134d..a5f7725 100644 --- a/tests/test_slotgemiddelden.py +++ b/tests/test_slotgemiddelden.py @@ -21,29 +21,30 @@ def test_calc_slotgemiddelden_outputtype(df_meas_2010_2014, df_ext_12_2010_2014) @pytest.mark.unittest -def test_fit_models(df_meas_2010_2014): +def test_predict_linear_model(df_meas_2010_2014): dict_wltidalindicators_valid = kw.calc_wltidalindicators( df_meas_2010_2014 ) # 24*365=8760 (hourly interval), 24/3*365=2920 (3-hourly interval) wl_mean_peryear_valid = dict_wltidalindicators_valid["wl_mean_peryear"] - wl_model_fit_nodal = kw.slotgemiddelden.fit_models( + wl_model_fit_nodal = kw.slotgemiddelden.predict_linear_model( wl_mean_peryear_valid, with_nodal=True ) nodal_expected = np.array( - [0.0141927, 0.08612119, 0.0853051, 0.07010864, 0.10051922, 0.23137634] + [0.07860955, 0.08999961, 0.07954378, 0.07398706, 0.09952146, 0.17882958] ) assert np.allclose(wl_model_fit_nodal.values, nodal_expected) - - wl_model_fit_linear = kw.slotgemiddelden.fit_models( + + wl_model_fit_linear = kw.slotgemiddelden.predict_linear_model( wl_mean_peryear_valid, with_nodal=False ) linear_expected = np.array( - [0.07851414, 0.0813139, 0.08411366, 0.08691342, 0.08971318, 0.09251294] + [0.07917004, 0.08175116, 0.08433229, 0.08691342, 0.08949454, 0.09207567] ) assert np.allclose(wl_model_fit_linear.values, linear_expected) + @pytest.mark.unittest def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014): slotgemiddelden_dict_inclext = kw.calc_slotgemiddelden( @@ -94,14 +95,14 @@ def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014): ) # fmt: off - wl_model_fit_expected = np.array([0.0141927, 0.08612119, 0.0853051, - 0.07010864, 0.10051922, 0.23137634]) - hw_model_fit_expected = np.array([1.05295416, 1.12875177, 1.13988685, - 1.1415461, 1.18998584, 1.336182]) - lw_model_fit_expected = np.array([-0.67420399, -0.59089362, -0.59342291, - -0.61334278, -0.58024113, -0.42969074]) - range_model_fit_expected = np.array([1.72715816, 1.71964539, 1.73330976, - 1.75488888, 1.77022697, 1.76587273]) + wl_model_fit_expected = np.array([0.07917004, 0.08175116, 0.08433229, + 0.08691342, 0.08949454, 0.09207567]) + hw_model_fit_expected = np.array([1.12529394, 1.13663287, 1.14797179, + 1.15931071, 1.17064963, 1.18198856]) + lw_model_fit_expected = np.array([-0.60236402, -0.59953375, -0.59670349, + -0.59387323, -0.59104297, -0.58821271]) + range_model_fit_expected = np.array([1.72765796, 1.73616662, 1.74467528, + 1.75318394, 1.7616926, 1.77020126]) # fmt: on assert np.allclose( slotgemiddelden_dict_inclext["wl_model_fit"].values, wl_model_fit_expected