|
| 1 | +""" |
| 2 | +This implementation is a Convolutional Neural Network for sentence classification. |
| 3 | +
|
| 4 | +It uses the same preprocessing as Kim et al., EMNLP 2014, 'Convolutional Neural Networks for Sentence Classification' (https://github.com/yoonkim/CNN_sentence). |
| 5 | +
|
| 6 | +Run the code: |
| 7 | +1) Run 'python preprocess.py'. This will preprocess the dataset and create the necessary pickle files in the pkl/ folder. |
| 8 | +2) Run this code via: python cnn.py |
| 9 | +
|
| 10 | +
|
| 11 | +Code was tested with: |
| 12 | +- Python 2.7 & Python 3.6 |
| 13 | +- Theano 0.9.0 & TensorFlow 1.2.1 |
| 14 | +- Keras 2.0.5 |
| 15 | +
|
| 16 | +Data structure: |
| 17 | +To run this network / to run a sentence classification using CNNs, the data must be in a certain format. |
| 18 | +The list train_sentences contains the different sentences of your training data. Each word in the training data is converted to |
| 19 | +the corresponding word index in the embeddings matrix. An example could look like: |
| 20 | +[[1,6,2,1,5,12,42], |
| 21 | + [7,23,56], |
| 22 | + [35,76,23,64,17,97,43,62,47,65]] |
| 23 | + |
| 24 | +Here we have three sentences, the first with 7 words, the second with 3 words and the third with 10 words. |
| 25 | +As our network expects a matrix as input for the mini-batches, we need to bring all sentences to the same length. This is a requirement |
| 26 | +of Theano to run efficiently. For this we use the function 'sequence.pad_sequences', which adds 0-padding to the matrix. After the padding, the list/matrix will look like this: |
| 27 | +[[0,0,0,1,6,2,1,5,12,42], |
| 28 | + [0,0,0,0,0,0,0,7,23,56], |
| 29 | + [35,76,23,64,17,97,43,62,47,65]] |
| 30 | + |
| 31 | +To make sure that the network does not interpret 0 as some word, we set the embeddings matrix (word_embeddings) such that the row at index 0 contains only zeros. You can check this by outputting word_embeddings[0]. |
| 32 | +
|
| 33 | +
|
| 34 | +Our labels (y_train) are a 1-dimensional vector containing the binary label for our sentiment classification example. |
| 35 | +
|
| 36 | +This code uses the functional API of Keras: https://keras.io/getting-started/functional-api-guide/ |
| 37 | +
|
| 38 | +It implements roughly the network proposed by Kim et al., Convolutional Neural Networks for Sentence Classification, using convolutions |
| 39 | +with several filter lengths. |
| 40 | +
|
| 41 | +Performance after 5 epochs: |
| 42 | +Dev-Accuracy: 79.09% (loss: 0.5046) |
| 43 | +Test-Accuracy: 77.44% (loss: 0.5163) |
| 44 | +""" |
| 45 | +from __future__ import print_function |
| 46 | +import numpy as np |
| 47 | +np.random.seed(1337) # for reproducibility |
| 48 | + |
| 49 | + |
| 50 | +import gzip |
| 51 | +import sys |
| 52 | +if (sys.version_info > (3, 0)): |
| 53 | + import pickle as pkl |
| 54 | +else: #Python 2.7 imports |
| 55 | + import cPickle as pkl |
| 56 | + |
| 57 | +import keras |
| 58 | +from keras.models import Model |
| 59 | +from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate |
| 60 | +from keras.layers import Embedding |
| 61 | +from keras.layers import Convolution1D, MaxPooling1D, GlobalMaxPooling1D |
| 62 | +from keras.regularizers import Regularizer |
| 63 | +from keras.preprocessing import sequence |
| 64 | + |
| 65 | + |
| 66 | + |
def wordIdxLookup(word, word_idx_map):
    """Return the index stored for ``word`` in ``word_idx_map``.

    Args:
        word: Key to look up.
        word_idx_map: Container supporting ``in`` and item access that maps
            words to their index.

    Returns:
        The value stored under ``word``, or ``None`` when ``word`` is not a
        key of ``word_idx_map``.
    """
    if word not in word_idx_map:
        # Unknown words have no index; make the fall-through result explicit.
        return None
    return word_idx_map[word]
| 70 | + |
| 71 | + |
| 72 | + |
| 73 | + |
# Load the preprocessed dataset (see the module docstring: it is created by
# running preprocess.py).
# NOTE(review): unpickling can execute arbitrary code; only load a
# pkl/data.pkl.gz file that you generated yourself.
with gzip.open("pkl/data.pkl.gz", "rb") as data_file:
    # The context manager closes the gzip handle even if unpickling fails.
    data = pkl.load(data_file)
print("data loaded!")


# Each split provides its labels and its sentences.
train_labels = data['train']['labels']
train_sentences = data['train']['sentences']

dev_labels = data['dev']['labels']
dev_sentences = data['dev']['sentences']

test_labels = data['test']['labels']
test_sentences = data['test']['sentences']

word_embeddings = data['wordEmbeddings']
| 88 | + |
# :: Find the longest sentence in our dataset ::
# Walk the three splits one after another instead of concatenating them, so
# no temporary combined list has to be built just to measure lengths.
max_sentence_len = 0
for split_sentences in (train_sentences, dev_sentences, test_sentences):
    for sentence in split_sentences:
        max_sentence_len = max(len(sentence), max_sentence_len)

print("Longest sentence: %d" % max_sentence_len)
| 95 | + |
| 96 | + |
| 97 | + |
# Convert the label lists of every split to NumPy arrays.
y_train = np.array(train_labels)
y_dev = np.array(dev_labels)
y_test = np.array(test_labels)

# Pad every sentence to the length of the longest one so that each split
# forms a matrix (the module docstring shows an example of the 0-padding).
X_train = sequence.pad_sequences(train_sentences, maxlen=max_sentence_len)
X_dev = sequence.pad_sequences(dev_sentences, maxlen=max_sentence_len)
X_test = sequence.pad_sequences(test_sentences, maxlen=max_sentence_len)


print('X_train shape:', X_train.shape)
print('X_dev shape:', X_dev.shape)
print('X_test shape:', X_test.shape)
| 110 | + |
| 111 | + |
| 112 | + |
| 113 | +# :: Create the network :: |
| 114 | + |
| 115 | +print('Build model...') |
| 116 | + |
# set parameters:
batch_size = 50  # mini-batch size handed to model.fit in the training loop

# NOTE(review): the next three values are not used in the visible code;
# presumably they are meant for the network definition in the TODO section
# (filters per convolution, convolution filter lengths, hidden layer size).
nb_filter = 50
filter_lengths = [1, 2, 3]
hidden_dims = 100
nb_epoch = 20  # number of iterations of the training loop
| 124 | + |
| 125 | +##################################### |
| 126 | +# |
| 127 | +# Create the Network |
| 128 | +# |
| 129 | +##################################### |
| 130 | + |
| 131 | +# !!TODO!! |
| 132 | +# Add your Keras network here |
| 133 | +# !! / TODO !! |
| 134 | + |
| 135 | + |
| 136 | + |
| 137 | +################################## |
| 138 | +# |
| 139 | +# Training of the Network |
| 140 | +# |
| 141 | +################################## |
| 142 | + |
# NOTE(review): `model` is not defined anywhere in the visible code; it has to
# be created in the "Create the Network" TODO section, otherwise this loop
# raises NameError on its first iteration.
for epoch in range(nb_epoch):
    print("\n------------- Epoch %d ------------" % (epoch + 1))
    # Train one epoch at a time so that the dev and test sets can be scored
    # after every pass over the training data.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=1)

    # Unpacking two values assumes the model is compiled with exactly one
    # metric (accuracy) in addition to the loss -- confirm when adding the
    # network.
    dev_loss, dev_accuracy = model.evaluate(X_dev, y_dev, verbose=0)
    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

    print("Dev-Accuracy: %.2f%% (loss: %.4f)" % (dev_accuracy * 100, dev_loss))
    print("Test-Accuracy: %.2f%% (loss: %.4f)" % (test_accuracy * 100, test_loss))
| 153 | + |
| 154 | + |
| 155 | + |
| 156 | + |
0 commit comments