@@ -6,16 +6,19 @@ extract_features_exp <- function(x) {
6
6
x <- x %> %
7
7
dplyr :: group_by(user_id ) %> %
8
8
dplyr :: summarise(
9
+ # # tweets features
9
10
n_sincelast = count_mean(since_last(.data $ created_at )),
10
11
n_timeofday = count_mean(hourofweekday(.data $ created_at )),
11
12
n = dplyr :: n(),
12
13
n_retweets = sum_(.data $ is_retweet ),
13
14
n_quotes = sum_(.data $ is_quote ),
15
+ n_langs = tfse :: n_uq(.data $ lang ),
14
16
retweet_count = mean_(c(0 , .data $ retweet_count )),
15
17
favorite_count = mean_(c(0 , .data $ favorite_count )),
16
- favourites_count = max_(c(0 , .data $ favourites_count )),
17
18
n_tweets = sum_(! .data $ is_retweet & ! .data $ is_quote ),
18
-
19
+ n_places = sum_(! is.na(.data $ place_name )),
20
+ n_geo_coords = ncoord(.data $ geo_coords ),
21
+ n_bbox_coords = ncoord(.data $ bbox_coords ),
19
22
iphone = sum_(" Twitter for iPhone" %in% .data $ source ) / .data $ n ,
20
23
webclient = sum_(" Twitter Web Client" %in% .data $ source ) / .data $ n ,
21
24
android = sum_(" Twitter for Android" %in% .data $ source ) / .data $ n ,
@@ -25,37 +28,43 @@ extract_features_exp <- function(x) {
25
28
google = sum_(" Google" %in% .data $ source ) / .data $ n ,
26
29
ifttt = sum_(" IFTTT" %in% .data $ source ) / .data $ n ,
27
30
facebook = sum_(" Facebook" %in% .data $ source ) / .data $ n ,
28
-
29
31
twittbotnet = sum_(" twittbot.net" %in% .data $ source ) / .data $ n ,
30
32
tweetdeck = sum_(" TweetDeck" %in% .data $ source ) / .data $ n ,
31
- twitterforblackberry = sum_(" Twitter for BlackBerry®" %in% .data $ source ) / .data $ n ,
33
+ twitterforblackberry = sum_(
34
+ " Twitter for BlackBerry®" %in% .data $ source ) / .data $ n ,
32
35
dlvrit = sum_(" dlvr.it" %in% .data $ source ) / .data $ n ,
33
36
instagram = sum_(" Instagram" %in% .data $ source ) / .data $ n ,
34
37
curiouscat = sum_(" Curious Cat" %in% .data $ source ) / .data $ n ,
35
38
echofon = sum_(" Echofon" %in% .data $ source ) / .data $ n ,
36
- ubersocialforblackberry = sum_(" UberSocial for BlackBerry" %in% .data $ source ) / .data $ n ,
39
+ ubersocialforblackberry = sum_(
40
+ " UberSocial for BlackBerry" %in% .data $ source ) / .data $ n ,
37
41
athkarapp = sum_(" athkarApp" %in% .data $ source ) / .data $ n ,
38
42
mobilewebm2 = sum_(" Mobile Web (M2)" %in% .data $ source ) / .data $ n ,
39
43
twitterfeed = sum_(" twitterfeed" %in% .data $ source ) / .data $ n ,
40
44
tweetbotfori οs = sum_(" Tweetbot for iΟS" %in% .data $ source ) / .data $ n ,
41
- tweetcasterforandroid = sum_(" TweetCaster for Android" %in% .data $ source ) / .data $ n ,
42
- twitcomcomunidades = sum_(" Twitcom - Comunidades " %in% .data $ source ) / .data $ n ,
45
+ tweetcasterforandroid = sum_(
46
+ " TweetCaster for Android" %in% .data $ source ) / .data $ n ,
47
+ twitcomcomunidades = sum_(
48
+ " Twitcom - Comunidades " %in% .data $ source ) / .data $ n ,
43
49
cloudhopper = sum_(" Cloudhopper" %in% .data $ source ) / .data $ n ,
44
50
twicca = sum_(" twicca" %in% .data $ source ) / .data $ n ,
45
51
wordpresscom = sum_(" WordPress.com" %in% .data $ source ) / .data $ n ,
46
52
mobileweb = sum_(" Mobile Web" %in% .data $ source ) / .data $ n ,
47
53
foursquare = sum_(" Foursquare" %in% .data $ source ) / .data $ n ,
48
54
showroomlive = sum_(" SHOWROOM-LIVE" %in% .data $ source ) / .data $ n ,
49
- twitterforwebsites = sum_(" Twitter for Websites" %in% .data $ source ) / .data $ n ,
55
+ twitterforwebsites = sum_(
56
+ " Twitter for Websites" %in% .data $ source ) / .data $ n ,
50
57
ios = sum_(" iOS" %in% .data $ source ) / .data $ n ,
51
58
tumblr = sum_(" Tumblr" %in% .data $ source ) / .data $ n ,
52
59
tweetlogix = sum_(" Tweetlogix" %in% .data $ source ) / .data $ n ,
53
60
socialoomph = sum_(" SocialOomph" %in% .data $ source ) / .data $ n ,
54
61
buffer = sum_(" Buffer" %in% .data $ source ) / .data $ n ,
55
62
twitcleplus = sum_(" twitcle plus" %in% .data $ source ) / .data $ n ,
56
63
keitaiweb = sum_(" Keitai Web" %in% .data $ source ) / .data $ n ,
57
- sandaysoftcumulus = sum_(" Sandaysoft Cumulus" %in% .data $ source ) / .data $ n ,
58
- twitpaneforandroid = sum_(" TwitPane for Android" %in% .data $ source ) / .data $ n ,
64
+ sandaysoftcumulus = sum_(
65
+ " Sandaysoft Cumulus" %in% .data $ source ) / .data $ n ,
66
+ twitpaneforandroid = sum_(
67
+ " TwitPane for Android" %in% .data $ source ) / .data $ n ,
59
68
playstationr4 = sum_(" PlayStation(R)4" %in% .data $ source ) / .data $ n ,
60
69
writelonger = sum_(" Write Longer" %in% .data $ source ) / .data $ n ,
61
70
featherforios = sum_(" feather for iOS " %in% .data $ source ) / .data $ n ,
@@ -66,28 +75,50 @@ extract_features_exp <- function(x) {
66
75
janetter = sum_(" Janetter" %in% .data $ source ) / .data $ n ,
67
76
dynamictweets = sum_(" Dynamic Tweets" %in% .data $ source ) / .data $ n ,
68
77
twitcasting = sum_(" TwitCasting" %in% .data $ source ) / .data $ n ,
69
- ubersocialforandroid = sum_(" UberSocial for Android" %in% .data $ source ) / .data $ n ,
70
- janetterforandroid = sum_(" Janetter for Android" %in% .data $ source ) / .data $ n ,
71
- twitterforandroidtablets = sum_(" Twitter for Android Tablets" %in% .data $ source ) / .data $ n ,
78
+ ubersocialforandroid = sum_(
79
+ " UberSocial for Android" %in% .data $ source ) / .data $ n ,
80
+ janetterforandroid = sum_(
81
+ " Janetter for Android" %in% .data $ source ) / .data $ n ,
82
+ twitterforandroidtablets = sum_(
83
+ " Twitter for Android Tablets" %in% .data $ source ) / .data $ n ,
72
84
twitterformac = sum_(" Twitter for Mac" %in% .data $ source ) / .data $ n ,
73
85
86
+ # # users features
87
+ lang_und = as.integer(.data $ account_lang [1 ] == " und" ),
88
+ lang_tr = as.integer(.data $ account_lang [1 ] == " tr" ),
89
+ lang_ru = as.integer(.data $ account_lang [1 ] == " ru" ),
90
+ lang_pt = as.integer(.data $ account_lang [1 ] == " pt" ),
91
+ lang_ja = as.integer(.data $ account_lang [1 ] == " ja" ),
92
+ lang_in = as.integer(.data $ account_lang [1 ] == " in" ),
93
+ lang_fr = as.integer(.data $ account_lang [1 ] == " fr" ),
94
+ lang_es = as.integer(.data $ account_lang [1 ] == " es" ),
95
+ lang_en = as.integer(.data $ account_lang [1 ] == " en" ),
96
+ lang_are = as.integer(.data $ account_lang [1 ] == " ar" ),
97
+ lang_de = as.integer(.data $ account_lang [1 ] == " de" ),
98
+ lang_it = as.integer(.data $ account_lang [1 ] == " it" ),
99
+ lang_id = as.integer(.data $ account_lang [1 ] == " id" ),
100
+ lang_ko = as.integer(.data $ account_lang [1 ] == " ko" ),
101
+ lang_nl = as.integer(.data $ account_lang [1 ] == " nl" ),
102
+ lang_hi = as.integer(.data $ account_lang [1 ] == " hi" ),
103
+ lang_fil = as.integer(.data $ account_lang [1 ] == " fil" ),
104
+ lang_th = as.integer(.data $ account_lang [1 ] == " th" ),
105
+ lang_engb = as.integer(.data $ account_lang [1 ] == " en-gb" ),
106
+ screen_name_alpha = nchar_(.data $ screen_name [1 ]),
107
+ screen_name_num = ndigit_(.data $ screen_name [1 ]),
74
108
prof_image_na = sum_(is.na(.data $ profile_image_url [1 ])),
75
109
prof_image_type = sum_(grepl(" \\ .jpg" , .data $ profile_image_url [1 ])),
76
-
77
110
profile_bg_na = sum_(is.na(.data $ profile_background_url [1 ])),
78
111
profile_bg_type = sum_(grepl(" \\ .png" , .data $ profile_background_url [1 ])),
79
-
80
112
profile_bn_na = sum_(is.na(.data $ profile_banner_url [1 ])),
81
-
82
113
verified = as.integer(.data $ verified [1 ]),
114
+ profile_url = ! is.na(.data $ profile_url [1 ]),
83
115
years_on_twitter = relative_twitter_age(.data $ account_created_at [1 ]),
84
116
tweets_per_year = .data $ n_tweets / (1 + .data $ years_on_twitter ),
85
-
86
- # # i added one here so it wouldn't return NaN or undefined values (0 / x)
87
117
statuses_count = max_(c(0 , .data $ statuses_count )),
88
118
followers_count = max_(c(0 , .data $ followers_count )),
89
119
friends_count = max_(c(0 , .data $ friends_count )),
90
120
listed_count = max_(c(0 , .data $ listed_count )),
121
+ favourites_count = max_(c(0 , .data $ favourites_count )),
91
122
tweets_to_followers = (.data $ statuses_count + 1 ) /
92
123
(.data $ followers_count + 1 ),
93
124
statuses_rate = (.data $ statuses_count + 1 ) /
@@ -106,8 +137,15 @@ age_of_twitter <- function() {
106
137
}
107
138
108
139
## Rescale an account's age relative to the value returned by
## age_of_twitter().
##
## account_created_at: creation timestamp(s) of the account, in any form
##   accepted by difftime().
## Returns a numeric value: (account age in years / age_of_twitter()) * 15.
## NOTE(review): age_of_twitter() is defined elsewhere in this file; it is
##   presumably expressed in years as well -- confirm.
relative_twitter_age <- function(account_created_at) {
  ## elapsed time between account creation and now, measured in days
  days_active <- difftime(Sys.time(), account_created_at, units = "days")
  ## convert to years using a fixed 365-day year
  account_years <- as.numeric(days_active) / 365
  ## set it at 15: a ratio of 1 maps to 15
  15 * (account_years / age_of_twitter())
}
146
+
147
## Count the elements of a list (e.g. a list-column of coordinate vectors)
## whose first value is present.
##
## x: a list (or vector) whose elements are coordinate vectors; an element
##   counts as "present" when it is non-empty and its first value is not NA.
## Returns a single integer: the number of present elements (0L for empty x).
ncoord <- function(x) {
  has_coord <- vapply(
    x,
    ## guard on length first: NULL / zero-length elements would otherwise
    ## yield a zero-length result and make vapply() abort
    function(.x) length(.x) > 0L && !is.na(.x[1]),
    logical(1),
    USE.NAMES = FALSE
  )
  ## summing a logical vector gives an integer count
  sum(has_coord)
}
150
+
151
+
0 commit comments