Skip to content

Commit

Permalink
also handle special tokens that are already in vocab
Browse files Browse the repository at this point in the history
Signed-off-by: Alexandros Koumparoulis <[email protected]>
  • Loading branch information
akoumpa committed Jan 11, 2025
1 parent 983dc66 commit 818bc5e
Showing 1 changed file with 17 additions and 1 deletion.
18 changes: 17 additions & 1 deletion nemo/collections/common/tokenizers/sentencepiece_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ def __init__(
self.id_to_special_token = {}
self.trim_spm_separator_after_special_token = trim_spm_separator_after_special_token
self.spm_separator_id = self.tokenizer.piece_to_id(spm_separator)
self.spm_separator = spm_separator

if special_tokens:
if not self.legacy:
Expand Down Expand Up @@ -104,7 +105,19 @@ def text_to_tokens(self, text):
next_token = min(indices, key=indices.get)
next_idx = idx + indices[next_token]

tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx]))
tok = self.tokenizer.encode_as_pieces(text[idx:next_idx])
# Chat-templates insert a space between a special token and first word (e.g.
# "[INST] who") which is tokenized as <inst-id> <space-id> <who-id> instead of
# <inst-id> <who-id>.
if (
self.trim_spm_separator_after_special_token
and len(tokens) > 0
and tokens[-1] in self.special_token_to_id
and len(tok) > 0
and tok[0] == self.spm_separator
):
tok.pop(0)
tokens.extend(tok)
tokens.append(next_token)
idx = next_idx + len(next_token)

Expand Down Expand Up @@ -268,6 +281,9 @@ def add_special_tokens(self, special_tokens):
self.special_token_to_id[token] = self.vocab_size
self.id_to_special_token[self.vocab_size] = token
self.vocab_size += 1
elif self.tokenizer.piece_to_id(token) != self.tokenizer.unk_id():
self.special_token_to_id[token] = self.tokenizer.piece_to_id(token)
self.id_to_special_token[self.special_token_to_id[token]] = token
else:
raise ValueError("Expected special_tokens to be a list or a dict " + str(type(special_tokens)))

Expand Down

0 comments on commit 818bc5e

Please sign in to comment.