-
Notifications
You must be signed in to change notification settings - Fork 1
/
WordCloudTheWet.R
154 lines (112 loc) · 4.14 KB
/
WordCloudTheWet.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
##################
library(wordcloud)
library(tm)
library(plyr)
library(stringr)
##################
files<-list.files(pattern=".txt")
wordblock<-Corpus(DirSource(getwd()))
wordblock<-tm_map(wordblock, tolower)
#wordblock_no_stop <-tm_map(wordblock, removeWords, stopwords("english"))
png("wordcloud_result.png")
wordcloud(wordblock)
#png("wordcloud_result_nostop.png")
#wordcloud(wordblock_no_stop)
############################
#Now for sentiment
# function score.sentiment
#https://sites.google.com/site/miningtwitter/questions/sentiment/analysis
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
# Parameters
# sentences: vector of text to score
# pos.words: vector of words of postive sentiment
# neg.words: vector of words of negative sentiment
# .progress: passed to laply() to control of progress bar
# create simple array of scores with laply
scores = laply(sentences,
function(sentence, pos.words, neg.words)
{
# remove punctuation
sentence = gsub("[[:punct:]]", "", sentence)
# remove control characters
sentence = gsub("[[:cntrl:]]", "", sentence)
# remove digits?
sentence = gsub('\\d+', '', sentence)
# define error handling function when trying tolower
tryTolower = function(x)
{
# create missing value
y = NA
# tryCatch error
try_error = tryCatch(tolower(x), error=function(e) e)
# if not an error
if (!inherits(try_error, "error"))
y = tolower(x)
# result
return(y)
}
# use tryTolower with sapply
sentence = sapply(sentence, tryTolower)
# split sentence into words with str_split (stringr package)
word.list = str_split(sentence, "\\s+")
words = unlist(word.list)
# compare words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# get the position of the matched term or NA
# we just want a TRUE/FALSE
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# final score
score = sum(pos.matches) - sum(neg.matches)
return(score)
}, pos.words, neg.words, .progress=.progress )
# data frame with scores for each sentence
scores.df = data.frame(text=sentences, score=scores)
return(scores.df)
}
###This function does it all
# import positive and negative words
pos = readLines("positive_words.txt")
neg = readLines("negative_words.txt")
#############
#https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107/blob/master/R/sentiment.R
sentence<-wordblock
###########
##scores = laply(sentences, function(sentence, pos.words, neg.words) {
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
########
pos.words<-pos
neg.words<-neg
########
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
########
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
##return(score)
##}, pos.words, neg.words, .progress=.progress )
# scores.df = data.frame(score=score.sentiment, text=sentence)
# return(scores.df)
###############################################
###############################################
write.table(pos.matches, file="pos_matches.out")
write.table(neg.matches, file="neg_matches.out")
write.table(score, file="Overall_Score.txt")
write.table(subset(words, pos.matches), file="PositiveWordList.txt")
write.table(subset(words, neg.matches), file="NegativeWordList.txt")