Commit 7999f3d: Skeleton files
nreimers committed Nov 21, 2016 (1 parent: e9a7b14)
Showing 7 changed files with 512 additions and 43 deletions.
14 changes: 6 additions & 8 deletions 2016-11_Seminar/Session 1 - SENNA/code for NER/NER.py
@@ -68,16 +68,14 @@
#####################################




# Create the train and predict_labels function
n_in = train_tokens.shape[1]
n_hidden = numHiddenUnits
n_out = len(label2Idx)


x = T.imatrix('x') # the data, one word+context per row
y = T.ivector('y') # the labels are presented as 1D vector of [int] labels


words = Sequential()
words.add(Embedding(output_dim=wordEmbeddings.shape[1], input_dim=wordEmbeddings.shape[0], input_length=n_in, weights=[wordEmbeddings], trainable=False))
words.add(Flatten())
@@ -96,16 +94,16 @@
# Use Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

print train_tokens.shape[0], ' train samples'
print train_tokens.shape[1], ' train dimension'
print test_tokens.shape[0], ' test samples'

# Train_y is a 1-dimensional vector containing the index of the label
# With np_utils.to_categorical we map it to a 1 hot matrix
train_y_cat = np_utils.to_categorical(train_y, n_out)
dev_y_cat = np_utils.to_categorical(dev_y, n_out)


##################################
#
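As a side note on the hunk above: np_utils.to_categorical maps a vector of integer label indices to a one-hot matrix. A minimal illustration (the label values here are invented for demonstration):

from keras.utils import np_utils
import numpy as np

labels = np.array([0, 2, 1, 2])          # integer label indices
one_hot = np_utils.to_categorical(labels, 3)
# one_hot:
# [[ 1.  0.  0.]
#  [ 0.  0.  1.]
#  [ 0.  1.  0.]
#  [ 0.  0.  1.]]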
17 changes: 8 additions & 9 deletions 2016-11_Seminar/Session 1 - SENNA/code for POS/POS.py
@@ -41,7 +41,7 @@



numHiddenUnits = 20


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
@@ -68,16 +68,14 @@
#####################################




# Create the train and predict_labels function
n_in = train_tokens.shape[1]
n_hidden = numHiddenUnits
n_out = len(label2Idx)


x = T.imatrix('x') # the data, one word+context per row
y = T.ivector('y') # the labels are presented as 1D vector of [int] labels


words = Sequential()
words.add(Embedding(output_dim=wordEmbeddings.shape[1], input_dim=wordEmbeddings.shape[0], input_length=n_in, weights=[wordEmbeddings], trainable=False))
words.add(Flatten())
@@ -96,16 +94,17 @@
# Use Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

print train_tokens.shape[0], ' train samples'
print train_tokens.shape[1], ' train dimension'
print test_tokens.shape[0], ' test samples'

# Train_y is a 1-dimensional vector containing the index of the label
# With np_utils.to_categorical we map it to a 1 hot matrix
train_y_cat = np_utils.to_categorical(train_y, n_out)
dev_y_cat = np_utils.to_categorical(dev_y, n_out)



##################################
#
87 changes: 87 additions & 0 deletions 2016-11_Seminar/Session 1 - SENNA/code for POS/POS_Skeleton.py
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
This is example code for a POS tagger using the SENNA architecture (Collobert et al.) and Keras.
Baseline:
NLTK Uni-/Bi-/Trigram Tagger: 91.33%
Performance after 4 epochs:
Dev-Accuracy: 96.55%
Test-Accuracy: 96.51%
@author: Nils Reimers
Code was written & tested with:
- Python 2.7
- Theano 0.8.1
- Keras 1.1.1
"""
import numpy as np
import theano
import theano.tensor as T


import time
import gzip
import cPickle as pkl


import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, Merge
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers.embeddings import Embedding





numHiddenUnits = 20


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

#Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()

#####################################
#
# Create the Network
#
#####################################


#
# ::::: Create your network here !! :::::::::
#


##################################
#
# Training of the Network
#
##################################


#
# :::: Put your train code here :::::::::
#
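
# One possible way to fill in the two placeholders above, following the pattern
# shown in POS.py earlier in this commit. This is a sketch, not part of the
# commit itself; the casing branch and the epoch count are assumptions.
n_in = train_tokens.shape[1]
n_out = len(label2Idx)

words = Sequential()
words.add(Embedding(output_dim=wordEmbeddings.shape[1], input_dim=wordEmbeddings.shape[0],
                    input_length=n_in, weights=[wordEmbeddings], trainable=False))
words.add(Flatten())

casing = Sequential()
casing.add(Embedding(output_dim=caseEmbeddings.shape[1], input_dim=caseEmbeddings.shape[0],
                     input_length=train_case.shape[1], weights=[caseEmbeddings], trainable=False))
casing.add(Flatten())

model = Sequential()
model.add(Merge([words, casing], mode='concat'))
model.add(Dense(output_dim=numHiddenUnits, activation='tanh'))
model.add(Dense(output_dim=n_out, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

train_y_cat = np_utils.to_categorical(train_y, n_out)
for epoch in xrange(4):
    model.fit([train_tokens, train_case], train_y_cat, nb_epoch=1, batch_size=35)
    dev_pred = model.predict_classes([dev_tokens, dev_case], verbose=False)
    print 'Dev accuracy:', np.mean(dev_pred == dev_y)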


137 changes: 137 additions & 0 deletions 2016-11_Seminar/Session 2 - Sentence CNN/code/cnn_Skeleton.py
@@ -0,0 +1,137 @@
"""
This implementation is a Convolutional Neural Network for sentence classification.
It uses the same preprocessing as Kim et al., EMNLP 2014, 'Convolutional Neural Networks for Sentence Classification' (https://github.com/yoonkim/CNN_sentence).
Preprocessing Option 1:
- Unzip the kim_et_al_preprocessed.p.gz
Preprocessing Option 2:
1. Download the Kim et al. source code from https://github.com/yoonkim/CNN_sentence
2. Download the word2vec embeddings GoogleNews-vectors-negative300.bin from https://code.google.com/archive/p/word2vec/
3. Run the preprocessing of Kim et al.: 'python process_data.py path', where path points to the word2vec binary file (i.e., the GoogleNews-vectors-negative300.bin file). This will create a pickle object called mr.p in the same folder, which contains the dataset in the right format.
4. Copy the pickle object 'mr.p' and store it in this folder. Rename it to kim_et_al_preprocessed.p
Run this code:
Run it via 'python cnn.py'.
Code was tested with:
- Python 2.7
- Theano 0.8.2
- Keras 1.1.0
Data structure:
To run this network, i.e., to perform sentence classification with CNNs, the data must be in a certain format.
The list train_sentences contains the sentences of your training data. Each word in the training data is converted to
its corresponding word index in the embeddings matrix. An example could look like:
[[1,6,2,1,5,12,42],
[7,23,56],
[35,76,23,64,17,97,43,62,47,65]]
Here we have three sentences, the first with 7 words, the second with 3 words and the third with 10 words.
As our network expects a matrix as input for the mini-batches, we need to bring all sentences to the same length; Theano requires this to run efficiently.
For this we use the function 'sequence.pad_sequences', which adds 0-padding to the matrix. After padding, the list/matrix looks like this:
[[0,0,0,1,6,2,1,5,12,42],
[0,0,0,0,0,0,0,7,23,56],
[35,76,23,64,17,97,43,62,47,65]]
To make sure that the network does not interpret 0 as some word, we set the embeddings matrix (word_embeddings) such that the row at index 0 contains only zeros. You can check this by printing word_embeddings[0].
Our labels (y_train) are a 1-dimensional vector containing the binary label for our sentiment classification example.
"""


import numpy as np
np.random.seed(1337) # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras import backend as K
import cPickle
import gzip



def wordIdxLookup(word, word_idx_map):
    if word in word_idx_map:
        return word_idx_map[word]
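    # Note: a word missing from word_idx_map falls through and returns None.
    # Kim et al.'s preprocessing builds word_idx_map over the full vocabulary,
    # so in practice every token should be found.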


# Load the preprocessed data from Kim et al. scripts
#sentences: Reviews, dictionary with the entries
# -"y": label,
# -"text": orig_rev,
# -"num_words",
# -"split": np.random.randint(0,cv)}

#word_embeddings: Word Embeddings
#random_embeddings: Random word Embeddings
#word_idx_map: Mapping of words to indices
#vocab: Vocabulary

sentences, word_embeddings, random_embeddings, word_idx_map, vocab = cPickle.load(gzip.open("kim_et_al_preprocessed.p.gz","rb"))
print "data loaded!"



train_labels = []
train_sentences = []

test_labels = []
test_sentences = []

max_sentence_len = 0

for datum in sentences:
    label = datum['y']
    cv = datum['split']
    words = datum['text'].split()
    wordIndices = [wordIdxLookup(word, word_idx_map) for word in words]

    if cv == 0:  # CV == 0 is our test set
        test_labels.append(label)
        test_sentences.append(wordIndices)
    else:
        train_labels.append(label)
        train_sentences.append(wordIndices)

    max_sentence_len = max(max_sentence_len, len(words))




y_train = np.array(train_labels)
y_test = np.array(test_labels)

X_train = sequence.pad_sequences(train_sentences, maxlen=max_sentence_len)
X_test = sequence.pad_sequences(test_sentences, maxlen=max_sentence_len)


print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape



# :: Create the network ::

print 'Build model...'

# set parameters:
batch_size = 32

nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 20


#
# ::::: Put your network here :::::::
#
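
# A minimal network that fits this skeleton (a sketch, not part of the commit;
# the layer sizes come from the parameters above, the dropout rate is an
# assumption, and the rest follows standard Keras 1.x usage for a binary
# sentiment classifier):
model = Sequential()
model.add(Embedding(word_embeddings.shape[0], word_embeddings.shape[1],
                    input_length=max_sentence_len, weights=[word_embeddings]))
model.add(Convolution1D(nb_filter=nb_filter, filter_length=filter_length,
                        border_mode='same', activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))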

30 changes: 4 additions & 26 deletions 2016-11_Seminar/Session 3 - Relation CNN/code/CNN.py
@@ -76,8 +76,6 @@
distanceModel2 = Sequential()
distanceModel2.add(Embedding(max_position, position_dims, input_length=positionTrain2.shape[1]))



wordModel = Sequential()
wordModel.add(Embedding(embeddings.shape[0], embeddings.shape[1], input_length=sentenceTrain.shape[1], weights=[embeddings], trainable=False))

@@ -86,8 +84,6 @@
convModel.add(Merge([wordModel, distanceModel1, distanceModel2], mode='concat'))




convModel.add(Convolution1D(nb_filter=nb_filter,
                            filter_length=filter_length,
                            border_mode='same',
@@ -133,25 +129,16 @@ def getPrecision(pred_test, yTest, targetLabel):

    return float(correctTargetLabelCount) / targetLabelCount

for epoch in xrange(nb_epoch):
    model.fit([sentenceTrain, positionTrain1, positionTrain2], train_y_cat, batch_size=batch_size, verbose=True, nb_epoch=1)
    pred_test = model.predict_classes([sentenceTest, positionTest1, positionTest2], verbose=False)

    dctLabels = np.sum(pred_test)
    totalDCTLabels = np.sum(yTest)

    acc = np.sum(pred_test == yTest) / float(len(yTest))
    max_acc = max(max_acc, acc)
    print "Accuracy: %.4f (max: %.4f)" % (acc, max_acc)



    f1Sum = 0
    f1Count = 0
@@ -163,15 +150,6 @@ def getPrecision(pred_test, yTest, targetLabel):
        f1Count += 1


    macroF1 = f1Sum / float(f1Count)
    max_f1 = max(max_f1, macroF1)

    print "Non-other Macro-Averaged F1: %.4f (max: %.4f)\n" % (macroF1, max_f1)

print "DONE"
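
# The per-label loop that fills f1Sum and f1Count is collapsed in the hunk
# above. For context, the usual pattern looks like this (a sketch; the label
# range and the use of label 0 as the 'Other' class are assumptions):
for targetLabel in xrange(1, np.max(yTest) + 1):
    prec = getPrecision(pred_test, yTest, targetLabel)
    rec = getPrecision(yTest, pred_test, targetLabel)   # recall: swap arguments
    f1 = 0 if (prec + rec) == 0 else 2 * prec * rec / (prec + rec)
    f1Sum += f1
    f1Count += 1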
