-
Notifications
You must be signed in to change notification settings - Fork 35
/
classifier.js
107 lines (90 loc) · 2.99 KB
/
classifier.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
const process = require('process');
const fs = require('fs');
const os = require('os');
const path = require('path');
const NaiveBayes = require('@ladjs/naivebayes');
const pMap = require('p-map');
const { readDirDeep } = require('read-dir-deep');
const CLASSIFIER_IGNORES = require('./classifier-ignores.js');
const MBOX_PATTERNS = require('./mbox-patterns.js');
const VOCABULARY_LIMIT = require('./vocabulary-limit.js');
const replacements = require('./replacements.js');
const SpamScanner = require('.');
const concurrency = os.cpus().length;
// To retrain from scratch, simply delete classifier.json before running.
let json;
try {
  json = require('./classifier.json');
  console.log('re-training with existing classifier');
} catch (err) {
  // Only a missing file means "start fresh". Any other failure (e.g. a
  // JSON syntax error in a corrupt classifier.json) must fail loudly —
  // otherwise all previous training data would be silently discarded.
  if (err.code !== 'MODULE_NOT_FOUND') throw err;
  console.log('training new classifier');
}
// Identity tokenizer: the scanner already produces fully prepared tokens,
// so the classifier must not split or transform them any further.
const tokenizer = (tokens) => tokens;
// Resume from the serialized model when one was loaded, otherwise start a
// brand-new classifier capped at VOCABULARY_LIMIT tokens.
const classifier = json
  ? NaiveBayes.fromJson(json, VOCABULARY_LIMIT)
  : new NaiveBayes({ tokenizer, vocabularyLimit: VOCABULARY_LIMIT });
// fromJson restores the serialized (default) tokenizer, so re-attach ours.
if (json) classifier.tokenizer = tokenizer;

// Both environment variables are required; fail fast before any scanning.
const { SPAM_CATEGORY, SCAN_DIRECTORY } = process.env;
if (SPAM_CATEGORY !== 'ham' && SPAM_CATEGORY !== 'spam')
  throw new Error('SPAM_CATEGORY environment variable missing');
if (typeof SCAN_DIRECTORY !== 'string')
  throw new Error('SCAN_DIRECTORY environment variable missing');

const scanner = new SpamScanner({
  replacements,
  classifier: true
});
// Tokenize one email source and feed its tokens to the classifier under the
// category given by SPAM_CATEGORY. Errors are logged per-source so a single
// bad file cannot abort the whole training run.
async function mapper(source) {
  try {
    const { tokens } = await scanner.getTokensAndMailFromSource(source);
    if (tokens.length === 0) return;

    const category = process.env.SPAM_CATEGORY;

    // Bias against false positives by duplicating every ham token,
    // effectively doubling ham's weight in the model.
    // NOTE: push is used instead of the slower concat.
    if (category === 'ham') {
      const originalLength = tokens.length;
      for (let index = 0; index < originalLength; index++) {
        tokens.push(tokens[index]);
      }
    }

    classifier.learn(tokens, category);
  } catch (err) {
    console.log('source error', source);
    console.error(err);
  }
}
(async () => {
  // Discover every training file under SCAN_DIRECTORY (i/o bound),
  // skipping ignored paths and raw mbox archives.
  console.time('sources');
  const dir = path.resolve(process.env.SCAN_DIRECTORY);
  const sources = await readDirDeep(dir, {
    ignore: [...CLASSIFIER_IGNORES, ...MBOX_PATTERNS]
  });
  console.timeEnd('sources');
  // Tokenize and learn each source, one mapper call per file,
  // bounded by one worker per CPU core.
  console.time('tokenSets');
  await pMap(sources, mapper, { concurrency });
  console.timeEnd('tokenSets');
  // Persist the trained model next to this script.
  console.time('writing classifier.json');
  fs.writeFileSync(
    path.join(__dirname, 'classifier.json'),
    classifier.toJson()
  );
  console.timeEnd('writing classifier.json');
  // Persist the replacements map used during tokenization.
  console.time('writing replacements.json');
  fs.writeFileSync(
    path.join(__dirname, 'replacements.json'),
    JSON.stringify(replacements, null, 2)
  );
  console.timeEnd('writing replacements.json');
  // eslint-disable-next-line unicorn/no-process-exit
  process.exit(0);
})().catch((err) => {
  // Without this handler a rejection from readDirDeep/pMap/fs would
  // surface as an unhandled promise rejection instead of a clean,
  // logged failure with a non-zero exit code.
  console.error(err);
  // eslint-disable-next-line unicorn/no-process-exit
  process.exit(1);
});