-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathw2v.py
41 lines (33 loc) · 1.38 KB
/
w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from os.path import join, exists, split
import os
import numpy as np
def load_word2vec(model_type, vocabulary_inv, num_features=300,
                  expected_vocab_size=400000):
    """Load pretrained word vectors and build an embedding-layer weight matrix.

    Parameters:
        model_type: pretrained model family; only 'glove' is supported here.
        vocabulary_inv: iterable of vocabulary words, in index order.
        num_features: word vector dimensionality (GloVe.6B ships 50/100/200/300).
        expected_vocab_size: sanity-check size of the pretrained vocabulary
            (400000 for GloVe.6B); pass a different value for other vector files.

    Returns:
        np.ndarray of shape (len(vocabulary_inv), num_features), dtype float32.
        In-vocabulary words get their pretrained vectors; out-of-vocabulary
        words get uniform random vectors in [-0.25, 0.25].

    Raises:
        FileNotFoundError: if the pretrained vector file is absent.
        ValueError: on an unknown model_type, a malformed vector line, or a
            pretrained-vocabulary size mismatch.
    """
    model_dir = 'word2vec_models'
    if model_type != 'glove':
        raise ValueError('Unknown pretrain model type: %s!' % (model_type))

    model_name = join(model_dir, 'glove.6B.%dd.txt' % (num_features))
    # Explicit raise instead of `assert`: asserts are stripped under `python -O`.
    if not exists(model_name):
        raise FileNotFoundError('Word2Vec model not found: %s' % model_name)
    print('Loading existing Word2Vec model (Glove.6B.%dd)' % (num_features))

    # dictionary, where key is word, value is word vector (list of floats)
    embedding_model = {}
    # Context manager guarantees the file is closed even on a parse error;
    # GloVe distributes its text files in UTF-8.
    with open(model_name, 'r', encoding='utf-8') as fh:
        for line in fh:
            tmp = line.strip().split()
            # list() is required on Python 3: map() returns a lazy iterator,
            # so len(vec) would raise TypeError and the stored value would
            # be a one-shot iterator instead of the vector itself.
            word, vec = tmp[0], list(map(float, tmp[1:]))
            if len(vec) != num_features:
                raise ValueError('Bad vector line for %r: expected %d dims, got %d'
                                 % (word, num_features, len(vec)))
            # First occurrence wins, matching the original behavior.
            if word not in embedding_model:
                embedding_model[word] = vec
    if len(embedding_model) != expected_vocab_size:
        raise ValueError('Expected %d pretrained words, loaded %d'
                         % (expected_vocab_size, len(embedding_model)))

    # Assemble one row per vocabulary word; OOV rows get uniform noise.
    embedding_weights = [embedding_model[w] if w in embedding_model
                         else np.random.uniform(-0.25, 0.25, num_features)
                         for w in vocabulary_inv]
    return np.array(embedding_weights).astype('float32')