-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathdensity.js
241 lines (219 loc) · 6.57 KB
/
density.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/**
* Density v0.1.0
* MIT licensed
*
* Copyright (C) 2012 usabli.ca - By Afshin Mehrabani
*/
(function () {
//Default config/variables
var VERSION = "0.1.0";
//Check for nodeJS: a CommonJS `module.exports` is only present there.
//Note that the value is `module.exports` itself (truthy) or `false`.
var hasModule = (typeof module !== 'undefined' && module.exports);
//Import node modules. `require` is only called when CommonJS was detected:
//calling it unconditionally makes the whole script fail to load in
//environments that do not provide it. `fs` is null in that case.
var fs = hasModule ? require('fs') : null;
/**
* Density main class
*
* @class Density
*/
function Density(str) {
    // Coerce the input before lowercasing so that non-string values do not
    // throw: numbers, or objects that implement toString() (a Density
    // instance, for example). null/undefined become empty text.
    var text = (str === null || str === undefined) ? "" : String(str);

    // The text is stored lowercased so that word comparison is
    // case-insensitive.
    this._str = text.toLowerCase();

    //default options
    this._options = {
        // JSON file holding the stop word list, next to this script
        stopWordFile: __dirname + "/stopwords.json",
        // length bounds used when counting keywords
        minKeywordLength: 2,
        maxKeywordLength: 50
    };
}
/**
* Remove all stop words from the text from given setting file
*
* @method _removeStopWords
*/
function _removeStopWords() {
    // Reads the stop word file synchronously on every call. The parsed value
    // is walked by index and every entry is used as a string, so the file is
    // expected to hold a JSON array of strings. It is lowercased as a whole
    // before parsing.
    var fileData = fs.readFileSync(this._options.stopWordFile, 'utf8').toLowerCase();
    var stopwords = JSON.parse(fileData);
    var text = this._str;

    for (var i = stopwords.length - 1; i >= 0; i--) {
        // Escape regex metacharacters so the stop word is matched literally.
        var escaped = stopwords[i].replace(/([.*+?^=!:${}()|[\]\/\\])/g, "\\$1");

        // A stop word only matches as a whole space-delimited token. The
        // trailing delimiter is a lookahead so that it is NOT consumed:
        // otherwise the space between two adjacent occurrences ("the the")
        // is used up by the first match and the second one is left behind.
        var regex = new RegExp("( |^)" + escaped + "(?= |$)", "g");

        // Keep the leading delimiter; the trailing one stays in place.
        text = text.replace(regex, "$1");
    }

    this._str = text;
}
/**
* Convert HTML to Text
* Thanks to: https://github.com/mtrimpe/jsHtmlToText
*
* @method _htmlToText
*/
function _htmlToText() {
    // Rewrites this._str in place: markup is stripped and the remaining text
    // ends up on a single line with runs of spaces collapsed.
    var text = this._str;

    text = text
        // Turn every line break into a space. From here on the text contains
        // no newline characters.
        .replace(/(?:\n|\r\n|\r)/ig, " ")
        // Remove content in script tags.
        .replace(/<\s*script[^>]*>[\s\S]*?<\/script>/mig, "")
        // Remove content in style tags.
        .replace(/<\s*style[^>]*>[\s\S]*?<\/style>/mig, "")
        // Remove content in comments.
        .replace(/<!--.*?-->/mig, "")
        // Remove !DOCTYPE
        .replace(/<!DOCTYPE.*?>/ig, "");

    // Tags that separate words. For the first list both the opening and the
    // closing tag become a space; for the second list only the opening tag
    // does (its closing tag is dropped with the remaining tags below).
    var spacedTags = ['p', 'h[1-6]', 'dl', 'dt', 'dd', 'ol', 'ul', 'dir', 'address', 'blockquote', 'center', 'div', 'hr', 'pre', 'form', 'textarea', 'table'];
    var spacedOpeningTags = ['li', 'del', 'ins', 'fieldset', 'legend','tr', 'th', 'caption', 'thead', 'tbody', 'tfoot'];
    var i;
    var tagPattern;

    for (i = 0; i < spacedTags.length; i++) {
        tagPattern = new RegExp('</?\\s*' + spacedTags[i] + '[^>]*>', 'ig');
        text = text.replace(tagPattern, ' ');
    }

    for (i = 0; i < spacedOpeningTags.length; i++) {
        tagPattern = new RegExp('<\\s*' + spacedOpeningTags[i] + '[^>]*>', 'ig');
        text = text.replace(tagPattern, ' ');
    }

    // Replace <br> and <br/> with a space
    text = text.replace(/<\s*br[^>]*\/?\s*>/ig, ' ');

    text = text
        // Remove all remaining tags (no space is inserted for these).
        .replace(/(<([^>]+)>)/ig, "")
        // Trim trailing whitespace. Line breaks were already turned into
        // spaces above, so no newline-specific clean-up is needed.
        .replace(/([^\n\S]+)$/, "")
        // Replace HTML character references (&name; &#123; &#x1f;) with a
        // space. The pattern is deliberately strict: a looser "anything up
        // to the next semicolon" match would delete ordinary text between a
        // bare "&" and a later ";".
        .replace(/&(?:#\d+|#x[0-9a-f]+|[a-z][a-z0-9]*);/ig, ' ')
        //remove all tabs and replace them with whitespace
        .replace(/\t/g, " ")
        //collapse runs of two or more spaces into one
        .replace(/ {2,}/g, " ");

    this._str = text;
}
/**
* Overwrites obj1's values with obj2's and adds obj2's if non existent in obj1
* via: http://stackoverflow.com/questions/171251/how-can-i-merge-properties-of-two-javascript-objects-dynamically
*
* @param obj1
* @param obj2
* @returns obj3 a new object based on obj1 and obj2
*/
function _mergeOptions(obj1, obj2) {
    // Copy every enumerable property (for...in, so inherited ones as well)
    // of obj1 and then obj2 into a fresh object. Later sources win, which is
    // what lets obj2 override obj1. Neither input is modified.
    var merged = {};
    var sources = [obj1, obj2];

    for (var index = 0; index < sources.length; index++) {
        var source = sources[index];
        for (var key in source) {
            merged[key] = source[key];
        }
    }

    return merged;
}
/**
* Calculate keyword density in the given text
*
* @method _calculateKeywordsDensity
* @return {Object} Keywords density
*/
function _calculateKeywordsDensity() {
    // Both preparation steps rewrite this._str in place: first the markup is
    // stripped, then the stop words are removed.
    _htmlToText.call(this);
    _removeStopWords.call(this);

    var minLength = this._options.minKeywordLength;
    var maxLength = this._options.maxKeywordLength;

    // Tokenise on single spaces and sort, so that equal words sit next to
    // each other and can be counted as runs.
    var tokens = this._str.split(" ");
    tokens.sort(function (left, right) {
        return left < right ? -1 : (left > right ? 1 : 0);
    });

    var results = [];
    var runLength = 1;
    var index = tokens.length;

    // Walk the sorted tokens from the end towards the start.
    while (index-- > 0) {
        var token = tokens[index];

        // Both bounds are exclusive: a token whose length equals
        // minKeywordLength or maxKeywordLength is skipped as well.
        if (token.length <= minLength || token.length >= maxLength) {
            continue;
        }

        if (token === tokens[index - 1]) {
            // still inside a run of the same word
            runLength += 1;
            continue;
        }

        // first token of the run reached: record the word with its count
        results.push({
            word: token,
            count: runLength
        });
        runLength = 1;
    }

    // Most frequent keywords first.
    results.sort(function (first, second) {
        return second.count - first.count;
    });

    return results;
}
/**
 * Public factory: wraps the given text in a Density instance.
 *
 * @param {String} inputStr text (plain or HTML) to analyse
 * @return {Density|null} a new instance, or null when there is no input
 */
var density = function (inputStr) {
    // `== null` covers both null and undefined, so calling the factory
    // without an argument returns null instead of reaching the constructor.
    if (inputStr == null || inputStr === "") {
        return null;
    }
    return new Density(inputStr);
};
/**
 * Current Density version, attached to the `density` factory function as a
 * static property (the value of the VERSION variable).
 *
 * @property version
 * @type String
 */
density.version = VERSION;
//Prototype
// Instance API. The same object is published as `density.fn`.
density.fn = Density.prototype = {
    /**
     * Create an independent copy holding the same text and options.
     *
     * @return {Density} the copy
     */
    clone: function () {
        // Pass the text, not the instance: the constructor expects a string.
        var copy = new Density(this._str);
        // Copy the options into a fresh object so that changing them on the
        // copy does not affect this instance.
        copy._options = _mergeOptions({}, this._options);
        return copy;
    },
    /**
     * @return {String} the current text
     */
    value: function () {
        return this._str;
    },
    /**
     * @return {String} the current text
     */
    toString: function () {
        return this._str.toString();
    },
    /**
     * Replace the text to analyse.
     *
     * @param value new text, converted to a string
     * @return {Density} this instance, for chaining
     */
    set: function (value) {
        // Lowercase exactly like the constructor does, so that text supplied
        // through set() is processed the same way.
        this._str = String(value).toLowerCase();
        return this;
    },
    /**
     * Set a single option.
     *
     * @param {String} option option name
     * @param value option value
     * @return {Density} this instance, for chaining
     */
    setOption: function (option, value) {
        this._options[option] = value;
        return this;
    },
    /**
     * Set several options at once; options not mentioned keep their value.
     *
     * @param {Object} options option names mapped to their new values
     * @return {Density} this instance, for chaining
     */
    setOptions: function (options) {
        this._options = _mergeOptions(this._options, options);
        return this;
    },
    /**
     * Calculate the keyword density of the current text.
     *
     * @return {Array} the result of _calculateKeywordsDensity
     */
    getDensity: function () {
        return _calculateKeywordsDensity.call(this);
    }
};
//Expose Density to whichever module systems are present.
//The three checks are independent, so more than one may apply.

//CommonJS: hand the factory over through module.exports
if (hasModule) {
    module.exports = density;
}

//global ender:false
var enderIsAbsent = (typeof ender === 'undefined');
if (enderIsAbsent) {
    // `this` is expected to be the global object here (`window` in the
    // browser, `global` on the server). The property is written with a
    // string key on purpose, for Closure Compiler "advanced" mode.
    this['density'] = density;
}

//global define:false
var amdIsPresent = (typeof define === 'function' && define.amd);
if (amdIsPresent) {
    // AMD: register as a named module without dependencies
    define('density', [], function () {
        return density;
    });
}
})();