Skip to content

Commit c536836

Browse files
committed
add features
1 parent fc2f70d commit c536836

File tree

2 files changed

+68
-19
lines changed

2 files changed

+68
-19
lines changed

R/features-exp.R

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,19 @@ extract_features_exp <- function(x) {
66
x <- x %>%
77
dplyr::group_by(user_id) %>%
88
dplyr::summarise(
9+
## tweets features
910
n_sincelast = count_mean(since_last(.data$created_at)),
1011
n_timeofday = count_mean(hourofweekday(.data$created_at)),
1112
n = dplyr::n(),
1213
n_retweets = sum_(.data$is_retweet),
1314
n_quotes = sum_(.data$is_quote),
15+
n_langs = tfse::n_uq(.data$lang),
1416
retweet_count = mean_(c(0, .data$retweet_count)),
1517
favorite_count = mean_(c(0, .data$favorite_count)),
16-
favourites_count = max_(c(0, .data$favourites_count)),
1718
n_tweets = sum_(!.data$is_retweet & !.data$is_quote),
18-
19+
n_places = sum_(!is.na(.data$place_name)),
20+
n_geo_coords = ncoord(.data$geo_coords),
21+
n_bbox_coords = ncoord(.data$bbox_coords),
1922
iphone = sum_("Twitter for iPhone" %in% .data$source) / .data$n,
2023
webclient = sum_("Twitter Web Client" %in% .data$source) / .data$n,
2124
android = sum_("Twitter for Android" %in% .data$source) / .data$n,
@@ -25,37 +28,43 @@ extract_features_exp <- function(x) {
2528
google = sum_("Google" %in% .data$source) / .data$n,
2629
ifttt = sum_("IFTTT" %in% .data$source) / .data$n,
2730
facebook = sum_("Facebook" %in% .data$source) / .data$n,
28-
2931
twittbotnet = sum_("twittbot.net" %in% .data$source) / .data$n,
3032
tweetdeck = sum_("TweetDeck" %in% .data$source) / .data$n,
31-
twitterforblackberry = sum_("Twitter for BlackBerry®" %in% .data$source) / .data$n,
33+
twitterforblackberry = sum_(
34+
"Twitter for BlackBerry®" %in% .data$source) / .data$n,
3235
dlvrit = sum_("dlvr.it" %in% .data$source) / .data$n,
3336
instagram = sum_("Instagram" %in% .data$source) / .data$n,
3437
curiouscat = sum_("Curious Cat" %in% .data$source) / .data$n,
3538
echofon = sum_("Echofon" %in% .data$source) / .data$n,
36-
ubersocialforblackberry = sum_("UberSocial for BlackBerry" %in% .data$source) / .data$n,
39+
ubersocialforblackberry = sum_(
40+
"UberSocial for BlackBerry" %in% .data$source) / .data$n,
3741
athkarapp = sum_("athkarApp" %in% .data$source) / .data$n,
3842
mobilewebm2 = sum_("Mobile Web (M2)" %in% .data$source) / .data$n,
3943
twitterfeed = sum_("twitterfeed" %in% .data$source) / .data$n,
4044
tweetbotforiοs = sum_("Tweetbot for iΟS" %in% .data$source) / .data$n,
41-
tweetcasterforandroid = sum_("TweetCaster for Android" %in% .data$source) / .data$n,
42-
twitcomcomunidades = sum_("Twitcom - Comunidades " %in% .data$source) / .data$n,
45+
tweetcasterforandroid = sum_(
46+
"TweetCaster for Android" %in% .data$source) / .data$n,
47+
twitcomcomunidades = sum_(
48+
"Twitcom - Comunidades " %in% .data$source) / .data$n,
4349
cloudhopper = sum_("Cloudhopper" %in% .data$source) / .data$n,
4450
twicca = sum_("twicca" %in% .data$source) / .data$n,
4551
wordpresscom = sum_("WordPress.com" %in% .data$source) / .data$n,
4652
mobileweb = sum_("Mobile Web" %in% .data$source) / .data$n,
4753
foursquare = sum_("Foursquare" %in% .data$source) / .data$n,
4854
showroomlive = sum_("SHOWROOM-LIVE" %in% .data$source) / .data$n,
49-
twitterforwebsites = sum_("Twitter for Websites" %in% .data$source) / .data$n,
55+
twitterforwebsites = sum_(
56+
"Twitter for Websites" %in% .data$source) / .data$n,
5057
ios = sum_("iOS" %in% .data$source) / .data$n,
5158
tumblr = sum_("Tumblr" %in% .data$source) / .data$n,
5259
tweetlogix = sum_("Tweetlogix" %in% .data$source) / .data$n,
5360
socialoomph = sum_("SocialOomph" %in% .data$source) / .data$n,
5461
buffer = sum_("Buffer" %in% .data$source) / .data$n,
5562
twitcleplus = sum_("twitcle plus" %in% .data$source) / .data$n,
5663
keitaiweb = sum_("Keitai Web" %in% .data$source) / .data$n,
57-
sandaysoftcumulus = sum_("Sandaysoft Cumulus" %in% .data$source) / .data$n,
58-
twitpaneforandroid = sum_("TwitPane for Android" %in% .data$source) / .data$n,
64+
sandaysoftcumulus = sum_(
65+
"Sandaysoft Cumulus" %in% .data$source) / .data$n,
66+
twitpaneforandroid = sum_(
67+
"TwitPane for Android" %in% .data$source) / .data$n,
5968
playstationr4 = sum_("PlayStation(R)4" %in% .data$source) / .data$n,
6069
writelonger = sum_("Write Longer" %in% .data$source) / .data$n,
6170
featherforios = sum_("feather for iOS " %in% .data$source) / .data$n,
@@ -66,28 +75,50 @@ extract_features_exp <- function(x) {
6675
janetter = sum_("Janetter" %in% .data$source) / .data$n,
6776
dynamictweets = sum_("Dynamic Tweets" %in% .data$source) / .data$n,
6877
twitcasting = sum_("TwitCasting" %in% .data$source) / .data$n,
69-
ubersocialforandroid = sum_("UberSocial for Android" %in% .data$source) / .data$n,
70-
janetterforandroid = sum_("Janetter for Android" %in% .data$source) / .data$n,
71-
twitterforandroidtablets = sum_("Twitter for Android Tablets" %in% .data$source) / .data$n,
78+
ubersocialforandroid = sum_(
79+
"UberSocial for Android" %in% .data$source) / .data$n,
80+
janetterforandroid = sum_(
81+
"Janetter for Android" %in% .data$source) / .data$n,
82+
twitterforandroidtablets = sum_(
83+
"Twitter for Android Tablets" %in% .data$source) / .data$n,
7284
twitterformac = sum_("Twitter for Mac" %in% .data$source) / .data$n,
7385

86+
## users features
87+
lang_und = as.integer(.data$account_lang[1] == "und"),
88+
lang_tr = as.integer(.data$account_lang[1] == "tr"),
89+
lang_ru = as.integer(.data$account_lang[1] == "ru"),
90+
lang_pt = as.integer(.data$account_lang[1] == "pt"),
91+
lang_ja = as.integer(.data$account_lang[1] == "ja"),
92+
lang_in = as.integer(.data$account_lang[1] == "in"),
93+
lang_fr = as.integer(.data$account_lang[1] == "fr"),
94+
lang_es = as.integer(.data$account_lang[1] == "es"),
95+
lang_en = as.integer(.data$account_lang[1] == "en"),
96+
lang_are = as.integer(.data$account_lang[1] == "ar"),
97+
lang_de = as.integer(.data$account_lang[1] == "de"),
98+
lang_it = as.integer(.data$account_lang[1] == "it"),
99+
lang_id = as.integer(.data$account_lang[1] == "id"),
100+
lang_ko = as.integer(.data$account_lang[1] == "ko"),
101+
lang_nl = as.integer(.data$account_lang[1] == "nl"),
102+
lang_hi = as.integer(.data$account_lang[1] == "hi"),
103+
lang_fil = as.integer(.data$account_lang[1] == "fil"),
104+
lang_th = as.integer(.data$account_lang[1] == "th"),
105+
lang_engb = as.integer(.data$account_lang[1] == "en-gb"),
106+
screen_name_alpha = nchar_(.data$screen_name[1]),
107+
screen_name_num = ndigit_(.data$screen_name[1]),
74108
prof_image_na = sum_(is.na(.data$profile_image_url[1])),
75109
prof_image_type = sum_(grepl("\\.jpg", .data$profile_image_url[1])),
76-
77110
profile_bg_na = sum_(is.na(.data$profile_background_url[1])),
78111
profile_bg_type = sum_(grepl("\\.png", .data$profile_background_url[1])),
79-
80112
profile_bn_na = sum_(is.na(.data$profile_banner_url[1])),
81-
82113
verified = as.integer(.data$verified[1]),
114+
profile_url = !is.na(.data$profile_url[1]),
83115
years_on_twitter = relative_twitter_age(.data$account_created_at[1]),
84116
tweets_per_year = .data$n_tweets / (1 + .data$years_on_twitter),
85-
86-
## i added one here so it wouldn't return NaN or undefined values (0 / x)
87117
statuses_count = max_(c(0, .data$statuses_count)),
88118
followers_count = max_(c(0, .data$followers_count)),
89119
friends_count = max_(c(0, .data$friends_count)),
90120
listed_count = max_(c(0, .data$listed_count)),
121+
favourites_count = max_(c(0, .data$favourites_count)),
91122
tweets_to_followers = (.data$statuses_count + 1) /
92123
(.data$followers_count + 1),
93124
statuses_rate = (.data$statuses_count + 1) /
@@ -106,8 +137,15 @@ age_of_twitter <- function() {
106137
}
107138

108139
## Convert an account creation time into an age score: the account's age in
## years (days elapsed / 365) divided by the value of age_of_twitter(), then
## multiplied by 15.
## NOTE(review): age_of_twitter() is defined elsewhere; presumably it returns
## the platform's own age in years -- confirm.
relative_twitter_age <- function(account_created_at) {
  elapsed_days <- difftime(Sys.time(), account_created_at, units = "days")
  account_years <- as.numeric(elapsed_days) / 365
  ## rescale the ratio by the fixed factor of 15
  (account_years / age_of_twitter()) * 15
}
146+
147+
## Count the elements of `x` whose first value is present (not NA).
##
## x: a list (or vector) in which each element holds one set of values.
## Returns a single integer: the number of elements with a non-missing first
## value. Zero-length elements (NULL, numeric(0), ...) count as missing
## instead of triggering a vapply() length error.
ncoord <- function(x) {
  has_first <- vapply(
    x,
    ## guard on length first so empty elements yield FALSE, not logical(0)
    function(.x) length(.x) > 0L && !is.na(.x[1]),
    logical(1),
    USE.NAMES = FALSE
  )
  sum(has_first)
}
150+
151+

R/utils.R

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,14 @@ count_mean <- function(x) {
6767
x <- as.integer(x) - 1L
6868
mean(x, na.rm = TRUE)
6969
}
70+
71+
72+
## Number of characters in each element of `x`, counting NA as 0.
##
## x: a character vector (or something nchar() accepts).
## Returns an integer vector the same length as `x`. The result is always
## integer, whether or not `x` contains missing values, so per-group results
## keep a single consistent type.
nchar_ <- function(x) {
  n <- nchar(x)
  ## nchar() reports 2 or NA for missing input; force those positions to 0
  n[is.na(x)] <- 0L
  n
}
75+
76+
77+
## Number of digit characters in each element of `x`, counting NA as 0.
##
## x: a character vector.
## Returns an integer vector the same length as `x`. The result is always
## integer, whether or not `x` contains missing values, so per-group results
## keep a single consistent type.
ndigit_ <- function(x) {
  ## strip every non-digit, then count what is left
  n <- nchar(gsub("\\D", "", x))
  ## missing input propagates as NA through gsub()/nchar(); report 0 instead
  n[is.na(x)] <- 0L
  n
}
80+

0 commit comments

Comments
 (0)