-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.js
71 lines (60 loc) · 1.62 KB
/
tokenizer.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
/**
 * Character-level tokenizer: assigns each distinct Unicode code point seen
 * during fit() a stable integer index, and converts strings to/from index
 * sequences. Serializable via toJson()/fromJson().
 */
class Tokenizer {
  constructor() {
    this.vocab = new Set();   // set of known characters
    this.wordToIndex = {};    // char -> integer index
    this.indexToWord = {};    // integer index -> char
    this.nextIndex = 0;       // next unassigned index
  }

  /**
   * Builds (or extends) the vocabulary from an iterable of strings.
   * Iterates by Unicode code point (for...of), so surrogate pairs such as
   * emoji are treated as single tokens. Indices already assigned are kept,
   * so calling fit() repeatedly is safe.
   * @param {Iterable<string>} texts - training strings
   */
  fit(texts) {
    for (const text of texts) {
      for (const char of text) {
        if (!this.vocab.has(char)) {
          this.vocab.add(char);
          this.wordToIndex[char] = this.nextIndex;
          this.indexToWord[this.nextIndex] = char;
          this.nextIndex++;
        }
      }
    }
  }

  /**
   * Encodes a string into an array of vocabulary indices.
   * BUG FIX: iterates by code point ([...text]) to match fit(); the previous
   * text.split('') broke surrogate pairs into UTF-16 halves, so characters
   * outside the BMP (emoji, etc.) were never found in the vocabulary.
   * Unknown characters fall back to the '<UNK>' index when present, else 0.
   * @param {string} text
   * @returns {number[]}
   */
  encode(text) {
    return [...text].map((char) => {
      const index = this.wordToIndex[char];
      if (index !== undefined) {
        return index;
      }
      // ?? (not ||) so an '<UNK>' token legitimately stored at index 0
      // is not silently skipped over.
      return this.wordToIndex['<UNK>'] ?? 0;
    });
  }

  /**
   * Decodes an array of indices back into a string. Unknown indices render
   * as the literal '<UNK>' placeholder.
   * @param {number[]} sequence
   * @returns {string}
   */
  decode(sequence) {
    return sequence.map((index) => this.indexToWord[index] || '<UNK>').join('');
  }

  /** @returns {number} number of distinct characters in the vocabulary */
  getVocabSize() {
    return Object.keys(this.wordToIndex).length;
  }

  /** @returns {string} JSON snapshot of the tokenizer state */
  toJson() {
    return JSON.stringify({
      vocab: Array.from(this.vocab),
      wordToIndex: this.wordToIndex,
      indexToWord: this.indexToWord,
      nextIndex: this.nextIndex
    });
  }

  /**
   * Reconstructs a Tokenizer from a toJson() snapshot.
   * (Removed a leftover console.log debug statement.)
   * @param {string} jsonString
   * @returns {Tokenizer}
   * @throws {TypeError} if the payload's vocab field is missing or not object-like
   */
  static fromJson(jsonString) {
    const data = JSON.parse(jsonString);
    if (typeof data.vocab !== 'object' || data.vocab === null) {
      throw new TypeError("Expected vocab to be an object");
    }
    const tokenizer = new Tokenizer();
    tokenizer.wordToIndex = data.wordToIndex;
    tokenizer.indexToWord = data.indexToWord;
    tokenizer.nextIndex = data.nextIndex;
    // Rebuild the vocab set from the index map so the two stay consistent.
    for (const char in data.wordToIndex) {
      tokenizer.vocab.add(char);
    }
    return tokenizer;
  }
}
module.exports = { Tokenizer };