Skip to content

Commit 01c1252

Browse files
author
Dan Pressel
committed
Initial commit.
1 parent 3e0f05a commit 01c1252

File tree

8 files changed

+26668
-2
lines changed

8 files changed

+26668
-2
lines changed

.jshintrc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
{
2+
"node": true,
3+
"browser": false,
4+
"es5": true,
5+
"esnext": true,
6+
"bitwise": true,
7+
"camelcase": true,
8+
"curly": true,
9+
"eqeqeq": true,
10+
"immed": true,
11+
"indent": 4,
12+
"latedef": true,
13+
"newcap": true,
14+
"noarg": true,
15+
"quotmark": "single",
16+
"regexp": true,
17+
"undef": true,
18+
"unused": true,
19+
"strict": true,
20+
"trailing": true,
21+
"smarttabs": true,
22+
"white": false,
23+
"globals": { "window": false }
24+
}

Gruntfile.js

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
'use strict';
2+
3+
module.exports = function (grunt) {
4+
// load all grunt tasks
5+
require('matchdep').filterDev('grunt-*').forEach(grunt.loadNpmTasks);
6+
7+
grunt.initConfig({
8+
clean: {
9+
dist: ['.tmp', 'dist/*'],
10+
server: '.tmp'
11+
},
12+
uglify: {
13+
my_target: {
14+
files: {
15+
'dist/textrank.min.js': ['index.js']
16+
}
17+
}
18+
},
19+
jshint: {
20+
options: {
21+
jshintrc: '.jshintrc'
22+
},
23+
all: [
24+
'lib/*.js'
25+
]
26+
},
27+
mochaTest: {
28+
test: {
29+
options: {
30+
reporter: 'spec'
31+
},
32+
src: ['test/*.js']
33+
}
34+
}
35+
});
36+
37+
grunt.registerTask('test', [
38+
'clean',
39+
'mochaTest'
40+
]);
41+
42+
grunt.registerTask('build', [
43+
'clean:dist',
44+
'uglify'
45+
]);
46+
47+
grunt.registerTask('default', [
48+
'jshint',
49+
'test',
50+
'build'
51+
]);
52+
};
53+

README.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,20 @@
1-
# textrank-js
2-
TextRank algorithm implementation in Javascript
1+
textrank-js
2+
===========
3+
4+
TextRank is an algorithm for Text Summarization, by Rada Mihalcea & Paul Tarau. This code here is based on their paper "TextRank: Bringing Order into Texts". I've noticed that there are many implementations out there, but this one is intended to demonstrate the algorithm without any additional baggage. I wanted to show how elegant, simple and clean it is, so here is an implementation in about 130 lines of JavaScript (ES5). It currently depends on lodash for a single function. This could easily be modified to make it dependency free if the ES6 find function exists, or by a slight code mod to that line.
5+
6+
The algorithm itself can extend to any type of graph, as they note in their paper, but I have provided two types of graphs explored in the paper: keyword extraction with an undirected graph derived from collocation, and sentence extraction using the similarity weighting on the edges in an undirected graph. There is a method in the module for each type, and once the graph has been built, the textRank function performs the algorithm on the generated graph.
7+
8+
Note this code only implements the TextRank algorithm itself, the sentences must be properly formatted upfront. I have provided example tokenization for both tasks in the tests directory, both derived from tokenizing the Wikipedia entry for "Automatic summarization", both minimally processed using a custom (very minimal) tokenizer, and OpenNLP's default models for sentence splitting and POS, and converted to JSON. As long as you get the format right that this is expecting, you should be able to use whatever library you want to preprocess. The keyword extraction builder needs the format to include POS tags since it filters the content while it is building its adjacencies. The sentence extraction builder does not require POS, but requires pre-split sentences.
9+
10+
The "tests" are not currently testing anything, but serve as demonstration code for how to run the software. Note that textRank() has a default number of iterations -- it doesn't try to test for convergence. This is just to keep it simple; it would be easy to modify it to test for convergence instead, but for now you can pass in any number you want if that default isn't suitable (see test examples).
11+
12+
13+
Build using Grunt:
14+
```
15+
$ npm install
16+
$ grunt
17+
18+
```
19+
20+
You are welcome to use this code for whatever nefarious purposes, but please attribute it to this implementation if you do.

index.js

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
'use strict';
2+
if (typeof require === 'function' && typeof exports === 'object' && typeof module === 'object') {
3+
var _ = require('lodash');
4+
}
5+
/**
 * Run the TextRank iteration over a weighted graph and rank its vertices.
 *
 * Each sweep recomputes, for every vertex i,
 *     score(i) = (1 - d) + d * SUM_j ( w(j -> i) / SUM_k w(j -> k) * score(j) )
 * where j ranges over the entries of V[i].in. Scores are updated in place,
 * so later vertices in a sweep already see the values computed earlier in
 * the same sweep.
 *
 * @param {Array} V  Graph as an array of vertices of the form
 *     {name, out: [{index, weight}], in: [{index, weight}]}, where `index`
 *     is a position in V. Missing slots (a sparse array) are ignored.
 * @param {number} [niter=200]  Number of sweeps to run. There is no
 *     convergence test; a falsy value selects the default.
 * @param {number} [dampening=0.85]  Damping factor d. Any number, including
 *     0, is honored; a non-number selects the default.
 * @returns {Array} One {name, vertex, score} record per vertex, sorted by
 *     descending score. `vertex` is the vertex's position in V.
 */
function textRank(V, niter, dampening) {

    // `dampening || 0.85` would silently replace an explicit 0.
    var d = (typeof dampening === 'number') ? dampening : 0.85;
    var K = niter || 200;
    var denom = [];      // denom[j]: total weight leaving vertex j
    var outWeight = [];  // outWeight[j][i]: weight of the first edge j -> i
    var scores = [];     // scores[j]: current score of vertex j

    V.forEach(function (v_j, j) {
        var total = 0.0;
        // Prototype-less map so any index value is a safe key.
        var byTarget = Object.create(null);
        v_j.out.forEach(function (edge) {
            total += edge.weight;
            // Keep the first edge per target, matching a first-match search.
            if (!(edge.index in byTarget)) {
                byTarget[edge.index] = edge.weight;
            }
        });
        denom[j] = total;
        outWeight[j] = byTarget;
        // Random start; the iteration is expected to settle regardless.
        scores[j] = Math.random();
    });

    // Weighted sum of the scores flowing into vertex i.
    function accum(i) {
        var acc = 0.0;
        V[i].in.forEach(function (v_j) {
            var j = v_j.index;
            var weights = outWeight[j];
            // Skip neighbours that do not exist, have no edge back to i, or
            // whose outgoing weights sum to zero (would yield NaN/Infinity).
            if (!weights || !(i in weights) || denom[j] === 0) {
                return;
            }
            acc += weights[i] / denom[j] * scores[j];
        });
        return acc;
    }

    for (var k = 0; k < K; ++k) {
        for (var i = 0; i < V.length; ++i) {
            if (!V[i]) {
                continue; // hole in a sparse vertex array
            }
            scores[i] = (1 - d) + d * accum(i);
        }
    }

    var ws = [];
    V.forEach(function (v_j, j) {
        ws.push({name: v_j.name, vertex: j, score: scores[j]});
    });
    ws.sort(function (x, y) {
        return y.score - x.score;
    });
    return ws;
}
47+
48+
/**
 * Build the undirected sentence-similarity graph used for sentence
 * extraction.
 *
 * Every sentence becomes one vertex, and every pair of sentences is linked
 * in both directions with the weight
 *     |unique tokens shared by s1 and s2| / (log|s1| + log|s2|).
 *
 * @param {Array} sentences  Pre-split sentences, each an array of tokens.
 *     Tokens are compared with strict equality.
 * @returns {Array} Vertices of the form
 *     {name: <sentence>, out: [{index, weight}], in: [{index, weight}]},
 *     one per input sentence, in input order.
 */
function sentExGraph(sentences) {
    // Number of distinct tokens of s1 that also occur in s2.
    function overlap(s1, s2) {
        var seen = [];
        var shared = 0;
        s1.forEach(function (tok) {
            if (seen.indexOf(tok) !== -1) {
                return; // already counted this token
            }
            seen.push(tok);
            if (s2.indexOf(tok) !== -1) {
                shared += 1;
            }
        });
        return shared;
    }

    function sim(s1, s2) {
        var norm = Math.log(s1.length) + Math.log(s2.length);
        // norm is 0 for two one-token sentences and -Infinity when either
        // sentence is empty; report "no similarity" instead of
        // Infinity / NaN / -0.
        if (!(norm > 0)) {
            return 0;
        }
        return overlap(s1, s2) / norm;
    }

    // Create every vertex up front so a lone sentence still yields a vertex.
    var V = [];
    for (var n = 0; n < sentences.length; ++n) {
        V.push({name: sentences[n], out: [], in: []});
    }

    for (var i = 0; i < sentences.length; ++i) {
        for (var j = i + 1; j < sentences.length; ++j) {
            var score = sim(sentences[i], sentences[j]);
            // Symmetric
            V[i].out.push({index: j, weight: score});
            V[i].in.push({index: j, weight: score});
            V[j].in.push({index: i, weight: score});
            V[j].out.push({index: i, weight: score});
        }
    }
    return V;
}
68+
69+
/**
 * Build the undirected, unweighted co-occurrence graph used for keyword
 * extraction.
 *
 * Only tokens whose POS tag starts with "N" or "J", or equals "ADJ", take
 * part. Two such tokens are linked when their positions i and j satisfy
 * i - win/2 <= j < i + win/2 (j !== i). Each distinct pair of terms is
 * linked once, with weight 1 in both directions.
 *
 * Vertices are numbered in order of first appearance in an edge, so the
 * returned array is dense (no holes) and each term has exactly one vertex.
 *
 * @param {Array} text  Tokens of the form {term, pos}, in document order.
 * @param {number} [win=2]  Co-occurrence window size; a falsy value selects
 *     the default.
 * @returns {Array} Vertices of the form
 *     {name: <term>, out: [{index, weight: 1}], in: [{index, weight: 1}]}.
 */
function keyExGraph(text, win) {

    var sz = text.length;
    var winSz = win || 2;
    var halfN = winSz / 2;
    var V = [];
    // Prototype-less maps: terms such as "constructor" must not collide with
    // inherited Object properties.
    var term2idx = Object.create(null);
    var seenEdges = Object.create(null);

    function isCandidate(token) {
        return /^[NJ]/.test(token.pos) || token.pos === 'ADJ';
    }

    // Index of the vertex for `term`, creating the vertex on first use.
    // Presence is tested with `in` because index 0 is a valid (falsy) value.
    function vertexIndex(term) {
        if (!(term in term2idx)) {
            term2idx[term] = V.length;
            V.push({name: term, out: [], in: []});
        }
        return term2idx[term];
    }

    // The graph is undirected, so the same neighbour goes on both lists.
    function link(from, to) {
        V[from].out.push({index: to, weight: 1});
        V[from].in.push({index: to, weight: 1});
    }

    for (var i = 0; i < sz; ++i) {
        var token = text[i];
        if (!isCandidate(token)) {
            continue;
        }
        // Round the lower bound up so j stays an integer for odd windows.
        var minWin = Math.max(0, Math.ceil(i - halfN));
        var maxWin = Math.min(sz, i + halfN);
        for (var j = minWin; j < maxWin; ++j) {
            if (i === j) {
                continue;
            }
            var other = text[j];
            if (!isCandidate(other)) {
                continue;
            }
            var pair = [token.term, other.term];
            pair.sort();
            // JSON keeps the pair unambiguous even when a term contains ",".
            var key = JSON.stringify(pair);
            if (seenEdges[key]) {
                continue;
            }
            seenEdges[key] = true;

            var a = vertexIndex(pair[0]);
            var b = vertexIndex(pair[1]);
            link(a, b);
            if (a !== b) {
                link(b, a);
            }
        }
    }
    return V;
}
133+
134+
// Publish the public API when a CommonJS-style `module.exports` is present;
// otherwise do nothing.
(function (target) {
    if (!target) {
        return;
    }
    target.textRank = textRank;
    target.keyExGraph = keyExGraph;
    target.sentExGraph = sentExGraph;
}(typeof module !== 'undefined' && module.exports));

package.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
{
2+
"name": "textrank",
3+
"main": "./index.js",
4+
"version": "0.0.1",
5+
"description": "TextRank implementation in Javascript",
6+
"scripts": {
7+
"test": "mocha test --recursive"
8+
},
9+
"repository": {
10+
"type": "git",
11+
"url": "https://github.com/dpressel/textrank-js.git"
12+
},
13+
"author": "Dan Pressel",
14+
"dependencies": {
15+
},
16+
"devDependencies": {
17+
"lodash": "3.0.0",
18+
"chai": "~1.7.2",
19+
"should": "~3.1.3",
20+
"mocha": "~1.13.0",
21+
"matchdep": "~0.1.1",
22+
"grunt": "~0.4.0",
23+
"grunt-contrib-uglify": "~0.1.1",
24+
"grunt-contrib-jshint": "~0.1.1",
25+
"grunt-contrib-connect": "0.1.2",
26+
"grunt-contrib-clean": "0.4.0",
27+
"grunt-mocha-test": "~0.9.0"
28+
}
29+
}
30+

0 commit comments

Comments
 (0)