Skip to content
This repository was archived by the owner on Jan 15, 2024. It is now read-only.

Commit 50e5278

Browse files
[BUGFIX] Fix vocab determinism in py35 (#1166) (#1167)
1 parent 6520c72 commit 50e5278

File tree

2 files changed

+17
-1
lines changed

2 files changed

+17
-1
lines changed

src/gluonnlp/vocab/vocab.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import json
2424
import uuid
2525
import warnings
26+
import sys
2627
from typing import Dict, Hashable, List, Optional
2728

2829
from mxnet import nd
@@ -37,6 +38,9 @@
3738
_DEPR_EOS = object()
3839

3940

41+
def _is_py35():
    """Return ``True`` when the running interpreter is Python 3.5.x.

    Only the major and minor components of ``sys.version_info`` are
    inspected; the micro/release-level fields are ignored.
    """
    # Slicing version_info yields a plain (major, minor) tuple.
    return sys.version_info[:2] == (3, 5)
43+
4044
class Vocab:
4145
"""Indexing and embedding attachment for text tokens.
4246
@@ -222,7 +226,10 @@ def __init__(self, counter: Optional[Counter] = None, max_size: Optional[int] =
222226

223227
# Handle special tokens
224228
special_tokens = []
225-
for special_token_name, special_token in kwargs.items():
229+
special_iter = kwargs.items()
230+
if _is_py35():
231+
special_iter = sorted(special_iter)
232+
for special_token_name, special_token in special_iter:
226233
# Test if kwarg specifies a special token
227234
if not special_token_name.endswith('_token'):
228235
raise ValueError('{} is invalid. Only keyword arguments '

tests/unittest/test_vocab_embed.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,3 +1457,12 @@ def test_vocab_remapped_unknown_token_idx(unknown_token, padding_token, eos_toke
14571457

14581458
v = Vocab(token_to_idx={unknown_token: 1})
14591459
assert v['UNKNOWNWORD'] == 1
1460+
1461+
def test_vocab_consistency():
    """Two vocabularies built from identical arguments index special tokens identically."""

    def _build():
        # Keep the keyword arguments spelled out literally so both instances
        # are constructed through exactly the same call.
        return nlp.Vocab({'a': 1}, mask_token='[MASK]', sep_token='[SEP]',
                         cls_token='[CLS]')

    # NOTE(review): both instances are created inside one interpreter process;
    # confirm this is sufficient to expose the ordering issue being guarded.
    first = _build()
    second = _build()
    for attr_name in ('mask_token', 'sep_token', 'cls_token'):
        assert first[getattr(first, attr_name)] == second[getattr(second, attr_name)]

0 commit comments

Comments
 (0)