## @knitr new_data
#Generate data for posterior_predict(newdata)
#Bootstrap from the original data the desired number of times
#n_new is defined globally
if(!exists("n_new")){warning("setting n_new to 200"); n_new <- 200}
df_new <- df[sample(1:nrow(df), n_new, replace=TRUE),]
#summary(df_new)
#force the randomised arm assignment to be an equal split, separate from the original data
n <- nrow(df_new)
n_arms <- nlevels(df_new$rx)
n_per_arm <- n%/%n_arms
spare <- n%%n_arms
df_new$rx <- factor(c(rep(1:n_arms, n_per_arm),rep(1,spare)), labels=levels(df$rx))
df_new$id <- 1:nrow(df_new)
#convert to long format with a row for each day
long <- expand.grid(time=sort(fail_times), id=df_new$id)
df_new_long <- left_join(long, df_new %>% select(-time), by="id")
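# Illustrative sanity check (not in the original script): by construction,
# expand.grid() gives every bootstrapped id one row per entry of fail_times.
if(FALSE){
  nrow(df_new_long) == n_new * length(fail_times)
}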
# Function to add back in a simulated status endpoint
# and convert to standard survival data: either long counting-process format, or short time/status
add_sim_status <- function(df, posterior, ...){
  # ... gives the covariates to keep.
  ## future_analysis doesn't pass this on directly at the moment
  require(dplyr)
  require(magrittr)
  # this works on one row of the posterior prediction matrix at a time
  df$status <- posterior
  df_fail_time <- df %>% group_by(id) %>%
    filter(status>0) %>% summarise(fail_time=min(time), .groups="drop")
  # Say you had a discrete survival distribution to simulate from:
  # generate a U(0,1) and then invert the CDF.
  # Grouping together the Poisson outcomes >=1 does this in effect;
  # a value >1 means the precise failure occurred earlier, but within that interval.
  # (a small illustrative sketch follows this function definition)
  df_long <- left_join(df, df_fail_time, by=c("id"="id")) %>%
    mutate(fail_time=ifelse(is.na(fail_time), Inf, fail_time)) %>%
    filter(time<=fail_time)
  covars <- c(quos(id, rx), quos(...))
  df_short <- df_long %>% group_by(!!!covars) %>%
    summarise(time=max(time), status=min(1, max(status)), .groups="drop")
  list("long"=df_long, "short"=df_short)
}
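# Illustrative sketch of the inverse-CDF idea referred to in the comments above.
# This is an assumption-laden illustration, not part of the pipeline: the helper
# name is made up, and the Kaplan-Meier of the original data is used only as an
# example discrete failure-time distribution.
if(FALSE){
  require(survival)
  sample_discrete_failure <- function(times, surv, n=1){
    cdf <- 1 - surv                            # F(t_k) = 1 - S(t_k)
    u <- runif(n)                              # U(0,1) draws
    sapply(u, function(ui){
      k <- which(cdf >= ui)[1]                 # first time at which F reaches u
      if (is.na(k)) Inf else times[k]          # no failure on the grid -> Inf
    })
  }
  km <- survfit(Surv(time, status) ~ 1, data=df)
  sample_discrete_failure(km$time, km$surv, n=10)
}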
#test
if(FALSE){
  ans <- add_sim_status(df_new_long, pred_data[6,], "rx")
  plot(survfit(Surv(time, status)~rx, data=ans$short))
  plot(survfit(Surv(time, status)~rx, data=df))
  df %>% group_by(rx) %>% summarise(event=mean(status))
  ans$short %>% group_by(rx) %>% summarise(event=mean(status))
  mean(df$status)
  mean(ans$short$status)
}
### Now add back in censoring ...
# Generate a censoring distribution from the Kaplan-Meier of the censoring indicator: survfit(Surv(time, 1-status)~1).
censor_dist <- survfit(Surv(time, 1-status)~1, data=df)
time <- censor_dist$time
S <- censor_dist$surv
p <- -diff(c(1,S))
index <- which(p!=0)
censor_dist <- data.frame(time=time[index], prob=p[index])
# But add in an administrative censoring time at the end if needed - this might vary between patients if we have calendar-time censoring.
admin_cens <- 15
# make this the last day + 1;
# the rule will be that censoring wins for ties,
# so this allows people to fail on day 14 rather than being censored, provided their censoring time is day 15
early <- subset(censor_dist, time < admin_cens)
prob_cens <- sum(early$prob)
censor_dist <- rbind(early,
                     data.frame(time=admin_cens, prob=1-prob_cens))
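# Quick illustrative check (not in the original script): the discrete censoring
# distribution built above should sum to 1, with the leftover mass moved to the
# administrative cut-off.
if(FALSE){
  stopifnot(abs(sum(censor_dist$prob) - 1) < 1e-8)
  barplot(censor_dist$prob, names.arg=censor_dist$time,
          xlab="censoring time", ylab="probability")
}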
add_censor_sample <- function(data, censor_dist){
  n <- nrow(data)
  data$censor_time <- sample(censor_dist$time, n, replace=TRUE, prob=censor_dist$prob)
  data <- mutate(data,
                 time=pmin(time, censor_time),
                 # censor_time <= time implements "censoring wins for ties"
                 status=ifelse(censor_time<=time, 0, status)
  )
  data
}
#test
if(FALSE){
  survfit(Surv(time, status)~rx, data=ans$short) %>% plot(mark.time=TRUE)
  ans2 <- add_censor_sample(ans$short, censor_dist)
  survfit(Surv(time, status)~rx, data=ans2) %>% plot(mark.time=TRUE)
}
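# Putting the pieces together: a hedged sketch (the number of draws, the shape
# of pred_data and the downstream summary are illustrative assumptions, not the
# project's actual future_analysis code) of running both helpers over several
# posterior draws and summarising each simulated trial.
if(FALSE){
  require(survival)
  fits <- lapply(1:10, function(i){
    sim <- add_sim_status(df_new_long, pred_data[i, ], "rx")
    sim_cens <- add_censor_sample(sim$short, censor_dist)
    coxph(Surv(time, status) ~ rx, data=sim_cens)
  })
  # e.g. the log hazard ratio(s) across the simulated datasets
  sapply(fits, coef)
}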