-
Notifications
You must be signed in to change notification settings - Fork 0
/
estimate_wastewater_re.R
239 lines (218 loc) · 9.31 KB
/
estimate_wastewater_re.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# This script is to estimate Re from WW influenza concentrations.
# It's based on JH's script https://github.com/JSHuisman/wastewaterRe/blob/main/code/wastewater_Zurich.R.
# Load dependencies
library(dplyr)
library(pracma) # for cubic spline interpolation
library(tidyr)
library(readr)
library(estimateR)
source("R/helper_scripts/functions.R")
source("R/helper_scripts/parameters.R")
source("R/helper_scripts/fitopts.R")
# Set variables
output_suffix <- "" # Dashboard uses data with default output name, i.e. output_suffix = ""
delay_dist_info <- influenza_distribution_infection_to_shedding_carrat2008_moments
mean_serial_interval <- influenza_mean_serial_interval_days
std_serial_interval <- influenza_std_serial_interval_days
estimation_window <- 3 # 3 is EpiEstim default
minimum_cumul_incidence <- 12 # minimum cumulative number of infections for Re to be estimated, EstimateR default is 12
seasons_to_calculate <- c("2022/23", "2023/24") # list of seasons to calculate estimates for (2021/22 is cached)
n_bootstrap_reps <- 50
# Import data
ww_data_bs <- read_csv("data/clean_data_bs.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_gr <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "ARA Chur") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_sg <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "ARA Altenrhein") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_fr <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "ARA Sensetal") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_ti <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "CDA Lugano") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_ge <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "STEP Aire") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
ww_data_zh <- read_csv("data/clean_data_eawag.csv", col_types = cols(sample_date = "D")) %>%
filter(measuring_period %in% seasons_to_calculate) %>%
filter(wwtp == "ARA Werdhölzli") %>%
pivot_wider(
id_cols = c("sample_date", "measuring_period", "wwtp"),
names_from = "measurement_type",
values_from = "mean")
wwtp_data_all <- list(
"Basel" = ww_data_bs,
"Chur" = ww_data_gr,
"Altenrhein" = ww_data_sg,
"Laupen" = ww_data_fr,
"Lugano" = ww_data_ti,
"Geneva" = ww_data_ge,
"Zurich" = ww_data_zh
)
# Note that this does not properly weigh the representativeness of the different
# treatment plants (i.e. population sizes and how loads translate into
# measurements at the specific sampling site)
# ww_data_all_combined <- bind_rows(wwtp_data_all) %>%
# group_by(sample_date, measuring_period) %>%
# summarize(across(-wwtp,mean,na.rm=T), .groups = "drop") %>%
# mutate(wwtp = "All provided data", .after = measuring_period)
# wwtp_data_all <- c(wwtp_data_all, list("All provided data" = ww_data_all_combined))
# Estimate Re for each data stream
is_first <- T
for (wwtp_name in names(wwtp_data_all)) {
set.seed(estimateR_seed)
df <- wwtp_data_all[[wwtp_name]]
wwtp_i <- unique(df$wwtp) # should be single plant
# Normalize daily copies by minimum value to get observations on the same scale as case data
# See Huisman et al. 2022 Environmental Health Perspectives -
# deconvolution algorithm seems to struggle when observations are too peaked
data_normalized <- tryCatch(
{
suppressWarnings(normalization_factor <- min(
df$IAV_gc_per_day[df$IAV_gc_per_day > 0],
df$IBV_gc_per_day[df$IBV_gc_per_day > 0],
na.rm = T))
suppressWarnings(df %>%
mutate(IAV_gc_per_day_norm = IAV_gc_per_day / normalization_factor) %>%
mutate(IBV_gc_per_day_norm = IBV_gc_per_day / normalization_factor))
},
error = function(cond) {
message(paste("Couldn't calculate Re for", wwtp_name,
"because normalization failed (probably due to no measurements this season.)"))
return(cond)
}
)
if(inherits(data_normalized, "error")) {
next # if normalization fails, skip this wwtp
}
data_long <- data_normalized %>% pivot_longer(
cols = c(IAV_gc_per_day, IBV_gc_per_day, IAV_gc_per_day_norm, IBV_gc_per_day_norm),
values_to = "observation",
names_to = c("influenza_type", "observation_units"),
names_pattern = "([A-Z]{3})_(.*)"
) |> select(sample_date, measuring_period, wwtp, influenza_type, observation_units, observation)
# Iterate over influenza types and seasons (if multiple)
for (influenza_type_j in unique(data_long$influenza_type)) {
for (measuring_period_k in unique(data_long$measuring_period)) {
writeLines(paste("\nEstimating Re for", wwtp_i, "influenza", influenza_type_j,
"in period", measuring_period_k))
# Get appropriate data
data_filtered <- data_long %>%
filter(
influenza_type == influenza_type_j,
measuring_period == measuring_period_k,
observation_units == "gc_per_day_norm") %>%
filter(!is.na(observation)) %>% # this is just to remove leading NA measurements for ZH IBV
arrange(sample_date)
# Try to estimate Re (handling case where not enough incidence observed to calculate)
estimates_bootstrap <- tryCatch({
# Interpolate measurements to daily values
data_interpolated <- interpolate_measurements(
data_frame = data_filtered,
date_col = "sample_date",
measurement_cols = c("observation")
)
measurements <- list(
values = data_interpolated$observation,
index_offset = 0)
get_block_bootstrapped_estimate(
measurements$values,
N_bootstrap_replicates = n_bootstrap_reps,
smoothing_method = "LOESS",
deconvolution_method = "Richardson-Lucy delay distribution",
estimation_method = "EpiEstim sliding window",
uncertainty_summary_method = "original estimate - CI from bootstrap estimates",
minimum_cumul_incidence = minimum_cumul_incidence,
combine_bootstrap_and_estimation_uncertainties = TRUE,
delay = delay_dist_info,
estimation_window = estimation_window,
mean_serial_interval = mean_serial_interval,
std_serial_interval = std_serial_interval,
ref_date = min(data_filtered$sample_date),
time_step = "day",
output_Re_only = F) %>%
mutate(observation_type = wwtp_i, influenza_type = influenza_type_j, measuring_period = measuring_period_k)
},
error = function(cond) {
message(paste("Couldn't calculate Re for", wwtp_i, "influenza", influenza_type_j))
message(cond)
# Make observation data frame anyways
if (nrow(data_filtered) == 0) {
warning("No measurements found")
dates <- NA
observations <- NA
} else {
dates <- data_filtered$sample_date
observations <- data_filtered$observation
}
return(data.frame(
date = dates,
observed_incidence = observations,
CI_down_observed_incidence = NA,
CI_up_observed_incidence = NA,
smoothed_incidence = NA,
CI_down_smoothed_incidence = NA,
CI_up_smoothed_incidence = NA,
deconvolved_incidence = NA,
CI_down_deconvolved_incidence = NA,
CI_up_deconvolved_incidence = NA,
Re_estimate = NA,
CI_down_Re_estimate = NA,
CI_up_Re_estimate = NA,
Re_highHPD = NA,
Re_lowHPD = NA,
bootstrapped_CI_down_Re_estimate = NA,
bootstrapped_CI_up_Re_estimate = NA,
observation_type = wwtp_i,
influenza_type = influenza_type_j,
measuring_period = measuring_period_k
))
})
# Aggregate data and Re estimates
if (is_first) {
data_all <- data_interpolated
estimates_bootstrap_all <- estimates_bootstrap
is_first <- F
} else {
data_all <- rbind(data_all, data_interpolated)
estimates_bootstrap_all <- rbind(estimates_bootstrap_all, estimates_bootstrap)
}
}
}
}
# Write out data used for Re inference and results
write.csv(
x = data_all,
file = paste0("app/data/ww_loads", output_suffix, ".csv")
)
write.csv(
x = estimates_bootstrap_all,
file = paste0("app/data/ww_re_estimates", output_suffix, ".csv")
)