diff --git a/src/classifier.js b/src/classifier.js index 073f74a..afc0776 100644 --- a/src/classifier.js +++ b/src/classifier.js @@ -68,7 +68,7 @@ class Classifier { tokens = this.vectorize(tokens) } - // Set up an empty entry for the label if it does not exist + // Set up an empty entry for the label if it does not exist if (typeof this._model.data[label] === 'undefined') { this._model.data[label] = {} } @@ -148,7 +148,7 @@ class Classifier { /** * Split a string into an array of lowercase words, with all non-letter characters removed - * + * * @param {string} input * @return {Array} */ @@ -179,7 +179,7 @@ class Classifier { if (!(words instanceof Array)) { throw new Error('input must be either a string or Array') } - + if (this._model.nGramMax < this._model.nGramMin) { throw new Error('Invalid nGramMin/nGramMax combination in model config') } @@ -190,22 +190,28 @@ class Classifier { // based on the models configured min/max values words.forEach((word, index) => { let sequence = '' - - words.slice(index).forEach(nextWord => { + let tokenCount = 0 + let nextWord + + // Create n-gram(s) of between nGramMin and nGramMax words from segment starting at (index) + // Increment the occurrence counter (tokens[sequence]) for each n-gram created + // Stop looping once we have nGramMax words (or reach the end of the segment) + let segment = words.slice(index) + while (tokenCount < this._model.nGramMax && tokenCount < segment.length) { + nextWord = segment[tokenCount] sequence += sequence ? (' ' + nextWord) : nextWord - let tokenCount = sequence.split(' ').length + tokenCount++ + if(tokenCount >= this._model.nGramMin && tokenCount <= this._model.nGramMax) { + if (typeof tokens[sequence] === 'undefined') { + tokens[sequence] = 0 + } - if (tokenCount < this._model.nGramMin || tokenCount > this._model.nGramMax) { - return + ++tokens[sequence] } + } + }) - if (typeof tokens[sequence] === 'undefined') { - tokens[sequence] = 0 - } - ++tokens[sequence] - }) - }) return tokens } diff --git a/test/classifier.js b/test/classifier.js index a8b3791..b5573da 100644 --- a/test/classifier.js +++ b/test/classifier.js @@ -74,7 +74,7 @@ describe('Classifier', () => { const classifier = new Classifier() classifier.model.nGramMin = 2 - + expect(() => classifier.tokenize('Hello world!')).to.throw(Error) }) @@ -129,6 +129,16 @@ describe('Classifier', () => { }) }) + it('should create a unigrams for the space character from an array of characters including a space', () => { + const classifier = new Classifier() + + expect(classifier.tokenize([' ','a','b'])).to.eql({ + ' ': 1, + 'a': 1, + 'b': 1 + }) + }) + it('should increment the occurrence of the duplicate tokens', () => { const classifier = new Classifier() @@ -195,7 +205,7 @@ describe('Classifier', () => { expect(() => classifier.train('test', [])).to.throw(Error) }) - + it('should add tokens to the vocabulary (if not configured to false)', () => { const classifier = new Classifier() @@ -254,7 +264,7 @@ describe('Classifier', () => { expect(classifier.train('hello world', 'test')).to.equal(classifier) }) }) - + describe('cosineSimilarity', () => { it('should throw an error if v1 is not an object literal', () => { const classifier = new Classifier()