dpressel
diff --git a/‎.jshintrc
Lines changed: 24 additions & 0 deletions b/‎.jshintrc
Lines changed: 24 additions & 0 deletions
diff --git a/‎Gruntfile.js
Lines changed: 53 additions & 0 deletions b/‎Gruntfile.js
Lines changed: 53 additions & 0 deletions
diff --git a/‎README.md
Lines changed: 20 additions & 2 deletions b/‎README.md
Lines changed: 20 additions & 2 deletions
diff --git a/‎index.js
Lines changed: 138 additions & 0 deletions b/‎index.js
Lines changed: 138 additions & 0 deletions
diff --git a/‎package.json
Lines changed: 30 additions & 0 deletions b/‎package.json
Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,24 @@
+{
+    "node": true,
+    "browser": false,
+    "es5": true,
+    "esnext": true,
+    "bitwise": true,
+    "camelcase": true,
+    "curly": true,
+    "eqeqeq": true,
+    "immed": true,
+    "indent": 4,
+    "latedef": true,
+    "newcap": true,
+    "noarg": true,
+    "quotmark": "single",
+    "regexp": true,
+    "undef": true,
+    "unused": true,
+    "strict": true,
+    "trailing": true,
+    "smarttabs": true,
+    "white": false,
+    "globals": { "window": false }
+}
@@ -0,0 +1,53 @@
+'use strict';
+
+module.exports = function (grunt) {
+    // load all grunt tasks
+    require('matchdep').filterDev('grunt-*').forEach(grunt.loadNpmTasks);
+
+    grunt.initConfig({
+        clean: {
+            dist: ['.tmp', 'dist/*'],
+            server: '.tmp'
+        },
+        uglify: {
+            my_target: {
+                files: {
+                    'dist/textrank.min.js': ['index.js']
+                }
+            }
+        },
+        jshint: {
+            options: {
+                jshintrc: '.jshintrc'
+            },
+            all: [
+                'lib/*.js'
+            ]
+        },
+        mochaTest: {
+            test: {
+                options: {
+                    reporter: 'spec'
+                },
+                src: ['test/*.js']
+            }
+        }
+    });
+
+    grunt.registerTask('test', [
+        'clean',
+        'mochaTest'
+    ]);
+
+    grunt.registerTask('build', [
+        'clean:dist',
+        'uglify'
+    ]);
+
+    grunt.registerTask('default', [
+        'jshint',
+        'test',
+        'build'
+    ]);
+};
+
@@ -1,2 +1,20 @@
-# textrank-js
-TextRank algorithm implementation in Javascript
+textrank-js
+===========
+
+TextRank is an algorithm for text summarization by Rada Mihalcea and Paul Tarau.  This code is based on their paper "TextRank: Bringing Order into Texts".  There are many implementations out there, but this one is intended to demonstrate the algorithm without any additional baggage.  I wanted to show how elegant, simple and clean it is, so here is an implementation in about 130 lines of JavaScript (ES5).  It currently depends on lodash for a single function.  This could easily be made dependency-free by using the ES6 find function where it exists, or by a slight modification to that line.
+
+The algorithm itself can extend to any type of graph, as they note in their paper, but I have provided two types of graphs explored in the paper: keyword extraction with an undirected graph derived from collocation, and sentence extraction using the similarity weighting on the edges in an undirected graph.  There is a method in the module for each type, and once the graph has been built, the textRank function performs the algorithm on the generated graph.
+
+Note that this code only implements the TextRank algorithm itself; the input must be tokenized and properly formatted up front.  I have provided example input for both tasks in the tests directory.  Both examples were derived from the Wikipedia entry for "Automatic summarization", minimally processed using a custom (very minimal) tokenizer and OpenNLP's default models for sentence splitting and POS tagging, and converted to JSON.  As long as you produce the format this code expects, you should be able to use whatever library you want for preprocessing.  The keyword extraction builder needs the input to include POS tags, since it filters the content while it is building its adjacencies.  The sentence extraction builder does not require POS tags, but does require pre-split sentences.
+
+The "tests" are not currently testing anything, but serve as demonstration code for how to run the software.  Note that textRank() runs a default number of iterations -- it doesn't test for convergence.  This is just to keep it simple; it would be easy to modify it to test for convergence instead, but for now you can pass in any number of iterations you want if that default isn't suitable (see the test examples).
+
+Build using Grunt:
+```
+$ npm install
+$ grunt
+
+```
+
+You are welcome to use this code for whatever nefarious purposes, but please attribute it to this implementation if you do.
@@ -0,0 +1,138 @@
+'use strict';
+if (typeof require === 'function' && typeof exports === 'object' && typeof module === 'object') {
+    var _ = require('lodash');
+}
/**
 * Scores the vertices of a weighted graph with the TextRank recurrence
 *
 *     WS(i) = (1 - d) + d * SUM over j in In(i) of  w(j,i) / SUM_k w(j,k) * WS(j)
 *
 * Scores start from random values and are updated in place for a fixed
 * number of sweeps; there is no convergence test.
 *
 * @param {Array} V - vertices, each shaped like
 *     {name: *, out: [{index, weight}], in: [{index, weight}]}, where `index`
 *     is the numeric position of the neighbouring vertex inside V.
 * @param {number} [niter] - number of sweeps; any falsy value means 200.
 * @param {number} [dampening] - damping factor d; any falsy value means 0.85.
 * @returns {Array} one {name, vertex, score} record per vertex, sorted by
 *     descending score (`vertex` is the position of the vertex in V).
 */
function textRank(V, niter, dampening) {

    var d = dampening || 0.85;
    var K = niter || 200;
    var denom = [];
    var ws = [];
    var incoming = [];
    var i, k, m, links, acc;

    // Total outgoing weight of one vertex.
    function totalWeight(edges) {
        var total = 0.0;
        edges.forEach(function (edge) {
            total += edge.weight;
        });
        return total;
    }

    // First edge in `edges` that points at vertex `target`, if any.
    function findEdge(edges, target) {
        for (var e = 0; e < edges.length; ++e) {
            if (edges[e].index === target) {
                return edges[e];
            }
        }
        return undefined;
    }

    V.forEach(function (vertex, j) {
        denom[j] = totalWeight(vertex.out);
        ws[j] = {name: vertex.name, vertex: j, score: Math.random()};
    });

    // The normalised weight of every incoming edge never changes between
    // sweeps, so resolve it once instead of searching on every iteration.
    V.forEach(function (vertex, target) {
        var shares = [];
        vertex.in.forEach(function (inEdge) {
            var j = inEdge.index;
            var edge = V[j] ? findEdge(V[j].out, target) : undefined;
            // A source whose outgoing weights sum to zero has nothing to
            // hand out; dividing by that sum would give NaN and poison
            // every score in the graph.
            if (edge && denom[j] !== 0) {
                shares.push({from: j, share: edge.weight / denom[j]});
            }
        });
        incoming[target] = shares;
    });

    for (k = 0; k < K; ++k) {
        for (i = 0; i < V.length; ++i) {
            links = incoming[i];
            if (!links) {
                // Hole in a sparse vertex array: nothing to score.
                continue;
            }
            acc = 0.0;
            for (m = 0; m < links.length; ++m) {
                acc += links[m].share * ws[links[m].from].score;
            }
            ws[i].score = (1 - d) + d * acc;
        }
    }

    ws.sort(function (x, y) {
        return y.score - x.score;
    });
    return ws;
}
+
/**
 * Builds the undirected sentence-similarity graph used for sentence
 * extraction. Every pair of sentences is linked in both directions with
 * the weight
 *
 *     |distinct tokens shared by s1 and s2| / (log|s1| + log|s2|)
 *
 * @param {Array} sentences - pre-split sentences; each one is expected to
 *     be an array of tokens (tokens are compared with strict equality).
 * @returns {Array} one vertex {name, out, in} per sentence, in input order.
 *     `name` is the sentence itself; `out` and `in` hold {index, weight}
 *     records pointing at every other sentence.
 */
function sentExGraph(sentences) {
    // Number of distinct tokens of s1 that also occur in s2.
    function countShared(s1, s2) {
        var shared = [];
        for (var t = 0; t < s1.length; ++t) {
            var token = s1[t];
            if (shared.indexOf(token) === -1 && s2.indexOf(token) !== -1) {
                shared.push(token);
            }
        }
        return shared.length;
    }

    function sim(s1, s2) {
        var overlap = countShared(s1, s2);
        if (overlap === 0) {
            return 0;
        }
        // Two one-token sentences give log(1) + log(1) = 0; treat the pair
        // as unrelated instead of producing an Infinity weight.
        var norm = Math.log(s1.length) + Math.log(s2.length);
        return norm > 0 ? overlap / norm : 0;
    }

    var V = [];
    var i, j, score;

    // Create every vertex up front so a lone sentence is still represented.
    for (i = 0; i < sentences.length; ++i) {
        V[i] = {name: sentences[i], out: [], in: []};
    }

    for (i = 0; i < sentences.length; ++i) {
        for (j = i + 1; j < sentences.length; ++j) {
            score = sim(sentences[i], sentences[j]);
            // Symmetric: the same weight is recorded in both directions.
            V[i].out.push({index: j, weight: score});
            V[i].in.push({index: j, weight: score});
            V[j].in.push({index: i, weight: score});
            V[j].out.push({index: i, weight: score});
        }
    }
    return V;
}
+
/**
 * Builds the undirected, unweighted co-occurrence graph used for keyword
 * extraction.
 *
 * Only tokens whose POS tag starts with `N` or `J`, or equals `ADJ`, take
 * part. Each such token at position i is paired with every other such
 * token at positions [i - h, i + h), where h = ceil(win / 2). Each distinct
 * pair of terms yields one edge of weight 1 in both directions.
 *
 * @param {Array} text - tokens in document order, each {term, pos}.
 * @param {number} [win] - co-occurrence window size; any falsy value means
 *     2. Odd sizes are rounded up to the next even size.
 * @returns {Array} vertices {name, out, in}; `name` is the term and
 *     `out` / `in` hold {index, weight: 1} records. Vertices are numbered
 *     in the order their terms first appear in an edge.
 */
function keyExGraph(text, win) {

    var V = [];
    var sz = text.length;
    var winSz = win || 2;
    // Whole number of positions, so the window never lands between tokens.
    var halfN = Math.ceil(winSz / 2);
    // Prototype-less maps: terms such as "constructor" must not collide
    // with inherited properties.
    var term2idx = Object.create(null);
    var seenEdges = Object.create(null);
    var i, j, token, other, minWin, maxWin, edge, key, from, to;

    function isCandidate(tok) {
        return /^[NJ]/.test(tok.pos) || tok.pos === 'ADJ';
    }

    // Index of the vertex for `term`, creating the vertex on first use.
    // Membership is tested with `in` because index 0 is falsy.
    function indexFor(term) {
        if (!(term in term2idx)) {
            term2idx[term] = V.length;
            V.push({name: term, out: [], in: []});
        }
        return term2idx[term];
    }

    function connect(source, target) {
        V[source].out.push({index: target, weight: 1});
        V[source].in.push({index: target, weight: 1});
    }

    for (i = 0; i < sz; ++i) {
        token = text[i];
        if (!isCandidate(token)) {
            continue;
        }
        minWin = Math.max(0, i - halfN);
        maxWin = Math.min(sz, i + halfN);
        for (j = minWin; j < maxWin; ++j) {
            if (i === j) {
                continue;
            }
            other = text[j];
            if (!isCandidate(other)) {
                continue;
            }
            edge = [token.term, other.term];
            edge.sort();
            // JSON keeps the two terms apart even when a term contains a comma.
            key = JSON.stringify(edge);
            if (seenEdges[key]) {
                continue;
            }
            seenEdges[key] = true;
            from = indexFor(edge[0]);
            to = indexFor(edge[1]);
            connect(from, to);
            if (from !== to) {
                connect(to, from);
            }
        }
    }
    return V;
}
+
// When a CommonJS `module` object is present, publish the three public
// functions on its exports.
if (typeof module !== 'undefined' && module.exports) {
    [
        ['textRank', textRank],
        ['keyExGraph', keyExGraph],
        ['sentExGraph', sentExGraph]
    ].forEach(function (entry) {
        module.exports[entry[0]] = entry[1];
    });
}
@@ -0,0 +1,30 @@
+{
+  "name": "textrank",
+  "main": "./index.js",
+  "version": "0.0.1",
+  "description": "TextRank implementation in Javascript",
+  "scripts": {
+    "test": "mocha test --recursive"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/dpressel/textrank-js.git"
+  },
+  "author": "Dan Pressel",
+  "dependencies": {
+  },
+  "devDependencies": {
+    "lodash": "3.0.0",
+    "chai": "~1.7.2",
+    "should": "~3.1.3",
+    "mocha": "~1.13.0",
+    "matchdep": "~0.1.1",
+    "grunt": "~0.4.0",
+    "grunt-contrib-uglify": "~0.1.1",
+    "grunt-contrib-jshint": "~0.1.1",
+    "grunt-contrib-connect": "0.1.2",
+    "grunt-contrib-clean": "0.4.0",
+    "grunt-mocha-test": "~0.9.0"
+  }
+}
+