-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathusers.R
56 lines (43 loc) · 1.45 KB
/
users.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
require(caret)
require(tidyverse)
# =================================================================== #
# prepare the data for modeling:
# Collapse the bid log to one row per (bidder_id, outcome) pair and count the
# distinct bids, devices, auctions, merchandise values, countries and IPs
# recorded for that pair. The result is ungrouped so later verbs operate on
# the whole table.
bidders <- ungroup(
  summarise(
    group_by(bids, bidder_id, outcome),
    n_bids = n_distinct(bid_id),
    n_device = n_distinct(device),
    n_auction = n_distinct(auction),
    n_merchandise = n_distinct(merchandise),
    n_country = n_distinct(country),
    n_ip = n_distinct(ip)
  )
) # author's recorded size: 2,013 x 9
# NOTE(review): the call above produces 2 key columns + 6 counts; confirm the
# recorded column count.
# Read the two CSV tables that are joined onto the per-bidder summary below.
uuu <- read_csv('assets/uuu.csv')
rrr <- read_csv('assets/rrr.csv')

# The following two columns are majority NA, so de-select them:
uuu2 <- select(uuu, -avg_inter_bid_t_SD, -avg_inter_bid_t_SD_diff)

# Keep only the id, the label and two of the count columns from `bidders`,
# then attach both tables. No `by` is supplied, so each join matches on
# whatever column names the two sides share (presumably bidder_id; TODO
# confirm against the CSV headers).
bidders2 <- select(bidders, bidder_id, outcome, n_auction, n_bids)
bidders2 <- left_join(bidders2, uuu2)
bidders2 <- left_join(bidders2, rrr) # 2013 x 42
# Print the share of missing values in every numeric column.
map_dbl(
  keep(bidders2, is.numeric),
  function(col) mean(is.na(col))
)

# Replace every NA, in every column, with the sentinel value -1.
# NOTE(review): ifelse() drops attributes, so any factor or date column would
# come back as its bare underlying values; confirm none are present here.
bidders2 <- map_df(
  bidders2,
  function(col) ifelse(is.na(col), -1, col)
)
sum(is.na(bidders2)) # 0

# Recode the label: 1 becomes 'yes', anything else becomes 'no', stored as a
# factor.
bidders2$outcome <- as.factor(
  ifelse(bidders2$outcome == 1, 'yes', 'no')
)
# prepare the data for modeling: -----------
# Build the predictor table (everything except the label and the id), drop
# near-zero-variance predictors, then drop predictors flagged as highly
# correlated (cutoff 0.75), and finally re-attach the label as `outcome`.
X <- bidders2 %>% select(-c(outcome, bidder_id))

# Column positions of near-zero-variance predictors; print their names.
nzvs <- nearZeroVar(X)
names(X)[nzvs]
# Guard the removal: with an empty index vector, X[-nzvs] is X[integer(0)],
# which would silently drop EVERY column instead of none.
if (length(nzvs) > 0) {
  X <- X[-nzvs] # remove the near-zero variance variables
}

# Pairwise correlations of the remaining predictors and the column positions
# caret suggests removing to get below the cutoff.
cor_vals <- cor(X)
high_cor <- findCorrelation(cor_vals, cutoff = .75)
length(high_cor) # 19
# Same empty-index guard as above.
if (length(high_cor) > 0) {
  X <- X[-high_cor]
}

# Final modeling table: filtered predictors plus the factor label.
bidders3 <- X
bidders3[['outcome']] <- bidders2$outcome
dim(bidders3) # 2013 x 17