Commit
Showing 7 changed files with 512 additions and 43 deletions.
2016-11_Seminar/Session 1 - SENNA/code for POS/POS_Skeleton.py (87 additions, 0 deletions)
@@ -0,0 +1,87 @@
# -*- coding: utf-8 -*-
"""
This is example code for a POS tagger using the SENNA architecture (Collobert et al.) and Keras.

Baseline:
NLTK Uni-/Bi-/Trigram Tagger: 91.33%

Performance after 4 epochs:
Dev-Accuracy: 96.55%
Test-Accuracy: 96.51%

@author: Nils Reimers

Code was written & tested with:
- Python 2.7
- Theano 0.8.1
- Keras 1.1.1
"""
import numpy as np
import theano
import theano.tensor as T

import time
import gzip
import cPickle as pkl

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Flatten, Merge
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers.embeddings import Embedding


numHiddenUnits = 20


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

# Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()


#####################################
#
# Create the Network
#
#####################################

#
# ::::: Create your network here !! :::::::::
#
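
# A minimal sketch of one possible SENNA-style window network, NOT the official
# solution: word and casing embeddings are looked up for each token window,
# concatenated, and passed through one hidden layer into a softmax over the POS
# tags. It assumes train_tokens / train_case are 2D arrays of shape
# (number of tokens, window size) holding word / casing indices; the optimizer
# choice ('adam') is arbitrary.

n_in = train_tokens.shape[1]   # window size (assumption, see comment above)
n_out = len(label2Idx)         # number of POS tags

words = Sequential()
words.add(Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],
                    input_length=n_in, weights=[wordEmbeddings]))
words.add(Flatten())

casing = Sequential()
casing.add(Embedding(input_dim=caseEmbeddings.shape[0], output_dim=caseEmbeddings.shape[1],
                     input_length=n_in, weights=[caseEmbeddings]))
casing.add(Flatten())

model = Sequential()
model.add(Merge([words, casing], mode='concat'))
model.add(Dense(numHiddenUnits, activation='tanh'))
model.add(Dense(n_out, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])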


##################################
#
# Training of the Network
#
##################################

#
# :::: Put your train code here :::::::::
#
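
# A minimal training sketch continuing the network sketch above (again an
# assumption, not the official solution): convert the labels to one-hot vectors,
# train for 4 epochs (as mentioned in the docstring), and report dev/test
# accuracy after each epoch. The mini-batch size of 128 is an arbitrary choice.

train_y_cat = np_utils.to_categorical(train_y, n_out)

number_of_epochs = 4
minibatch_size = 128

for epoch in xrange(number_of_epochs):
    start_time = time.time()
    model.fit([train_tokens, train_case], train_y_cat,
              nb_epoch=1, batch_size=minibatch_size, verbose=True, shuffle=True)
    print "%.2f sec for training" % (time.time() - start_time)

    dev_pred = model.predict_classes([dev_tokens, dev_case], verbose=False)
    test_pred = model.predict_classes([test_tokens, test_case], verbose=False)

    print "Dev-Accuracy: %.2f%%" % (np.mean(dev_pred == dev_y) * 100)
    print "Test-Accuracy: %.2f%%" % (np.mean(test_pred == test_y) * 100)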
2016-11_Seminar/Session 2 - Sentence CNN/code/cnn_Skeleton.py (137 additions, 0 deletions)
@@ -0,0 +1,137 @@
""" | ||
This implementation is a Convolutional Neural Network for sentence classification. | ||
It uses the same preprocessing of Kim et al., EMNLP 2014, 'Convolutional Neural Networks for Sentence Classification ' (https://github.com/yoonkim/CNN_sentence). | ||
Preprocessing Option 1: | ||
- Unzip the kim_et_al_preprocessed.p.gz | ||
Preprocessing Option 2: | ||
1. Download Kim et al. source code at Convolutional Neural Networks for Sentence Classification | ||
2. Download the word2vec embeddings GoogleNews-vectors-negative300.bin from https://code.google.com/archive/p/word2vec/ | ||
3. Run the preprocessing of Kim et al: 'python process_data.py path' where where path points to the word2vec binary file (i.e. GoogleNews-vectors-negative300.bin file). This will create a pickle object called mr.p in the same folder, which contains the dataset in the right format. | ||
4. Copy the pickle object 'mr.p' and store it in this folder. Rename it to kim_et_al_preprocessed.p | ||
Run this code: | ||
Run this code via 'python cnn.py'. | ||
Code was tested with: | ||
- Python 2.7 | ||
- Theano 0.8.2 | ||
- Keras 1.1.0 | ||
Data structure: | ||
To run this network / to run a sentence classification using CNNs, the data must be in a certain format. | ||
The list train_sentences containts the different sentences of your training data. Each word in the training data is converted to | ||
the according word index in the embeddings matrix. An example could look like: | ||
[[1,6,2,1,5,12,42], | ||
[7,23,56], | ||
[35,76,23,64,17,97,43,62,47,65]] | ||
Here we have three sentences, the first with 7 words, the second with 3 words and the third with 10 words. | ||
As our network expects a matrix as input for the mini-batchs, we need to bring all sentences to the same length. This is a requirement | ||
of Theano to run efficiently. For this we use the function 'sequence.pad_sequences', which adds 0-padding to the matrix. The list/matrix will look after the padding like this: | ||
[[0,0,0,1,6,2,1,5,12,42], | ||
[0,0,0,0,0,0,0,7,23,56], | ||
[35,76,23,64,17,97,43,62,47,65]] | ||
To make sure that the network does not interpret 0 as some word, we set the embeddings matrix (word_embeddings) such that the 0-column only contains 0. You can check this by outputting word_embeddings[0]. | ||
Our labels (y_train) are a 1-dimensional vector containing the binary label for out sentiment classification example. | ||
""" | ||

import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Embedding
from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras import backend as K
import cPickle
import gzip


def wordIdxLookup(word, word_idx_map):
    if word in word_idx_map:
        return word_idx_map[word]


# Load the preprocessed data from the Kim et al. scripts
# sentences: Reviews, dictionary with the entries
#   - "y": label,
#   - "text": orig_rev,
#   - "num_words",
#   - "split": np.random.randint(0,cv)
#
# word_embeddings: Word Embeddings
# random_embeddings: Random word Embeddings
# word_idx_map: Mapping of words to indices
# vocab: Vocabulary

sentences, word_embeddings, random_embeddings, word_idx_map, vocab = cPickle.load(gzip.open("kim_et_al_preprocessed.p.gz", "rb"))
print "data loaded!"


train_labels = []
train_sentences = []

test_labels = []
test_sentences = []

max_sentence_len = 0

for datum in sentences:
    label = datum['y']
    cv = datum['split']
    words = datum['text'].split()
    wordIndices = [wordIdxLookup(word, word_idx_map) for word in words]

    if cv == 0:  # CV=0 is our test set
        test_labels.append(label)
        test_sentences.append(wordIndices)
    else:
        train_labels.append(label)
        train_sentences.append(wordIndices)

    max_sentence_len = max(max_sentence_len, len(words))


y_train = np.array(train_labels)
y_test = np.array(test_labels)

X_train = sequence.pad_sequences(train_sentences, maxlen=max_sentence_len)
X_test = sequence.pad_sequences(test_sentences, maxlen=max_sentence_len)

print 'X_train shape:', X_train.shape
print 'X_test shape:', X_test.shape


# :: Create the network ::

print 'Build model...'

# set parameters:
batch_size = 32

nb_filter = 250
filter_length = 3
hidden_dims = 250
nb_epoch = 20


#
# ::::: Put your network here :::::::
#
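
# A minimal sketch of one possible network, NOT the official solution: an
# embedding layer initialised with the pre-trained word_embeddings, a single 1D
# convolution with max-over-time pooling, one hidden layer with dropout, and a
# sigmoid output for the binary sentiment label. The activation functions,
# dropout rate and optimizer ('adam') are arbitrary choices.

model = Sequential()
model.add(Embedding(word_embeddings.shape[0], word_embeddings.shape[1],
                    input_length=max_sentence_len, weights=[word_embeddings]))

model.add(Convolution1D(nb_filter=nb_filter,
                        filter_length=filter_length,
                        border_mode='same',
                        activation='relu'))
model.add(GlobalMaxPooling1D())

model.add(Dense(hidden_dims, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X_train, y_train,
          batch_size=batch_size,
          nb_epoch=nb_epoch,
          validation_data=(X_test, y_test))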