Skip to content

Commit

Permalink
49 add testcase for data retrieve (#50)
Browse files Browse the repository at this point in the history
* removed boxplot

* added tests for data_retrieve and data_analysis

* removed warning catching; it raised errors on py39 and is not necessary

* fixed timedelta for older pandas versions
  • Loading branch information
veenstrajelmer authored Jun 10, 2024
1 parent 90d48ff commit 56d4308
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 27 deletions.
5 changes: 0 additions & 5 deletions examples/KWK_getcheckdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,6 @@
fig, ax = kw.df_amount_pcolormesh(df_amount_ext, relative=True)
fig.savefig(file_plot + "_ext_pcolormesh_relative", dpi=200)

fig, ax = kw.df_amount_boxplot(df_amount_ts)
fig.savefig(file_plot + "_ts_boxplot", dpi=200)
fig, ax = kw.df_amount_boxplot(df_amount_ext)
fig.savefig(file_plot + "_ext_boxplot", dpi=200)



### RETRIEVE DATA FROM DDL AND WRITE TO NETCDF
Expand Down
17 changes: 2 additions & 15 deletions kenmerkendewaarden/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
import logging

__all__ = [
"df_amount_boxplot",
"df_amount_pcolormesh",
"plot_measurements",
"derive_statistics",
Expand All @@ -20,18 +19,6 @@
logger = logging.getLogger(__name__)


def df_amount_boxplot(df):
    """Draw one box per column of ``df`` and return the figure and axes.

    The input is left untouched: a copy is made, its index is converted
    with ``pd.to_datetime`` and every value equal to 0 is replaced by NaN
    before plotting (the y-axis label states "0 excluded").

    Parameters
    ----------
    df : pandas.DataFrame
        Table of amounts; presumably one row per year and one column per
        station, judging by the axis label -- confirm against the caller.

    Returns
    -------
    tuple
        The ``(fig, ax)`` pair created by ``plt.subplots``.
    """
    amounts = df.copy()
    amounts.index = pd.to_datetime(amounts.index)
    # zeros are blanked out so they are not drawn as measurements
    amounts[amounts == 0] = np.nan

    fig, ax = plt.subplots(figsize=(14, 8))
    amounts.plot.box(ax=ax, rot=90, grid=True)
    ax.set_ylabel("measurements per year (0 excluded) [-]")
    fig.tight_layout()
    return fig, ax


def df_amount_pcolormesh(df, relative=False):
df = df.copy()
df[df==0] = np.nan
Expand Down Expand Up @@ -120,8 +107,8 @@ def get_flat_meta_from_dataset(ds):
def get_stats_from_dataframe(df):
df_times = df.index
ts_dupltimes = df_times.duplicated()
ts_timediff = df_times.diff()[1:]
ts_timediff = df_times[1:]-df_times[:-1] # from pandas 2.2.0 the following also works: df_times.diff()[1:]

ds_stats = {}
ds_stats['tstart'] = df_times.min()
ds_stats['tstop'] = df_times.max()
Expand Down
11 changes: 4 additions & 7 deletions kenmerkendewaarden/data_retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import os
import pandas as pd
import ddlpy
import warnings
from pyproj import Transformer
import pooch
import logging
Expand Down Expand Up @@ -45,7 +44,7 @@ def retrieve_catalog(overwrite=False, crs:int = None):
catalog_filter = ['Eenheden','Grootheden','Hoedanigheden','Groeperingen','Parameters','Compartimenten','Typeringen']
locations_full = ddlpy.locations(catalog_filter=catalog_filter)
drop_columns = [x for x in locations_full.columns if x.endswith(".Omschrijving")]
drop_columns.append("Parameter_Wat_Omschrijving")
# drop_columns.append("Parameter_Wat_Omschrijving") # TODO: uncomment after ddlpy 0.6.0 is released: https://github.com/Deltares/ddlpy/pull/104
locations = locations_full.drop(columns=drop_columns)
pd.to_pickle(locations, file_catalog_pkl)

Expand Down Expand Up @@ -116,18 +115,16 @@ def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_
# TODO: no ext station available for ["A12","AWGPFM","BAALHK","GATVBSLE","D15","F16","F3PFM","J6","K14PFM",
# "L9PFM","MAASMSMPL","NORTHCMRT","OVLVHWT","Q1","SINTANLHVSGR","WALSODN"]
# https://github.com/Rijkswaterstaat/wm-ws-dl/issues/39
amount_ext = pd.DataFrame({station:[]})
amount_ext.index.name = "Groeperingsperiode"
amount_meas = pd.DataFrame({station:[]}, dtype='int64')
amount_meas.index.name = "Groeperingsperiode"
else:
amount_meas = ddlpy.measurements_amount(location=loc_meas_one.iloc[0], start_date=start_date, end_date=end_date)
amount_meas = amount_meas.rename(columns={"AantalMetingen":station})

amount_list.append(amount_meas)

logger.info(f'write measurement amount csvs to {os.path.basename(dir_output)}')
with warnings.catch_warnings(action="ignore", category=FutureWarning):
# to suppress "FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. "
df_amount = pd.concat(amount_list, axis=1).sort_index()
df_amount = pd.concat(amount_list, axis=1).sort_index()
df_amount = df_amount.fillna(0).astype(int)

df_amount.to_csv(file_csv_amount)
Expand Down
84 changes: 84 additions & 0 deletions tests/test_data_retrieve.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
# -*- coding: utf-8 -*-

import pytest
import kenmerkendewaarden as kw
import numpy as np
import pandas as pd


@pytest.mark.timeout(60)  # useful in case of ddl failure
@pytest.mark.systemtest
@pytest.mark.parametrize("extremes", [False, True], ids=["timeseries", "extremes"])
def test_retrieve_read_measurements_amount(tmp_path, extremes):
    """Retrieve measurement amounts for HOEKVHLD and check what is read back.

    The amounts are written to ``tmp_path`` by
    ``kw.retrieve_measurements_amount`` and read again with
    ``kw.read_measurements_amount``. The result must have two rows and the
    HOEKVHLD column must match the hard-coded reference values for the
    requested kind of data (timeseries or extremes).
    """
    station = "HOEKVHLD"
    period = {
        "start_date": pd.Timestamp(2010, 11, 1, tz="UTC+01:00"),
        "end_date": pd.Timestamp(2011, 2, 1, tz="UTC+01:00"),
    }

    kw.retrieve_measurements_amount(
        dir_output=tmp_path,
        station_list=[station],
        extremes=extremes,
        **period,
    )
    amounts = kw.read_measurements_amount(dir_output=tmp_path, extremes=extremes)

    # reference values differ between extremes and the full timeseries
    expected = np.array([312, 157]) if extremes else np.array([8784, 4465])

    assert len(amounts) == 2
    assert np.allclose(amounts[station].values, expected)


@pytest.mark.timeout(60)  # useful in case of ddl failure
@pytest.mark.systemtest
@pytest.mark.parametrize("extremes", [False, True], ids=["timeseries", "extremes"])
def test_retrieve_read_measurements_derive_statistics(tmp_path, extremes):
    """Retrieve, read and summarise measurements for HOEKVHLD.

    Measurements are retrieved into ``tmp_path``, read back, and then
    ``kw.derive_statistics`` is run on the same directory. The test checks
    the number of measurements, the set of statistics columns, the
    mean/min/max values and the smallest and largest time step against
    hard-coded references, which differ between extremes and timeseries.
    """
    station = "HOEKVHLD"
    tstart = pd.Timestamp(2010, 1, 1, tz="UTC+01:00")
    tstop = pd.Timestamp(2011, 1, 1, tz="UTC+01:00")

    # columns expected for both timeseries and extremes
    expected_columns = {
        'WaarnemingMetadata.StatuswaardeLijst',
        'WaarnemingMetadata.KwaliteitswaardecodeLijst',
        'WaardeBepalingsmethode.Code', 'MeetApparaat.Code', 'Hoedanigheid.Code',
        'Grootheid.Code', 'Groepering.Code', 'Typering.Code', 'tstart', 'tstop',
        'timediff_min', 'timediff_max', 'nvals', '#nans', 'min', 'max', 'std',
        'mean', 'dupltimes', 'dupltimes_#nans', 'qc_none',
    }
    if extremes:
        # two additional columns are expected for extremes only
        expected_columns |= {'timediff<4hr', 'aggers'}
        expected_len = 1863
        expected_mean_min_max = np.array([0.07922705314009662, -1.33, 2.11])
        expected_timediff_min = pd.Timedelta('0 days 00:34:00')
        expected_timediff_max = pd.Timedelta('0 days 08:57:00')
    else:
        expected_len = 52561
        expected_mean_min_max = np.array([0.07962614866536023, -1.33, 2.11])
        expected_timediff_min = pd.Timedelta('0 days 00:10:00')
        expected_timediff_max = pd.Timedelta('0 days 00:10:00')

    # retrieve meas
    kw.retrieve_measurements(
        dir_output=tmp_path,
        station=station,
        extremes=extremes,
        start_date=tstart,
        end_date=tstop,
    )

    # read meas
    measurements = kw.read_measurements(
        dir_output=tmp_path, station=station, extremes=extremes
    )

    # assert amount of measurements, this might change if ddl data is updated
    assert len(measurements) == expected_len

    stats = kw.derive_statistics(
        dir_output=tmp_path, station_list=[station], extremes=extremes
    )

    # assert statistics columns
    assert set(stats.columns) == expected_columns

    # assert statistics values, this might change if ddl data is updated
    station_stats = stats.loc[station, ["mean", "min", "max"]]
    assert np.allclose(station_stats.values.astype(float), expected_mean_min_max)

    assert stats.loc[station, "timediff_min"] == expected_timediff_min
    assert stats.loc[station, "timediff_max"] == expected_timediff_max

0 comments on commit 56d4308

Please sign in to comment.