Commit

Encodings

UKP committed Jul 11, 2017
1 parent 55ee01d commit d052047
Showing 10 changed files with 375 additions and 13 deletions.
111 changes: 111 additions & 0 deletions 2017-07_Seminar/Session 1 - SENNA/code for NER/NER_template.py
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
This is an example for performing sequence tagging with Keras.
We use the GermEval 2014 NER dataset (German) and implement the SENNA architecture (Collobert et al., NLP (almost) from scratch).
The code can easily be changed to any other sequence tagging task.
Performance after 10 epochs (GermEval 2014 German NER):
Development F1-score: 70.3%
Test F1-score: 69.9%
Code was written & tested with:
- Python 2.7 & Python 3.6
- Theano 0.9.0 and tensorflow 1.2.1
- Keras 2.0.5
@author: Nils Reimers, www.deeplearning4nlp.com
"""
from __future__ import print_function
import numpy as np
import time
import gzip

import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl


import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate
from keras.layers import Embedding

import BIOF1Validation



numHiddenUnits = 100


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

#Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()

#####################################
#
# Create the Network
#
#####################################



print(train_tokens.shape[0], ' train samples')
print(train_tokens.shape[1], ' train dimension')
print(test_tokens.shape[0], ' test samples')



# !!TODO!!
# Add your Keras network here
# !! / TODO !!
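
# A minimal sketch of one possible network (the TODO above is the exercise):
# the window-based SENNA architecture, assuming the array shapes that
# preprocess.py stores in the pkl files. Treat it as a starting point,
# not the reference solution.
n_in = train_tokens.shape[1]  # number of tokens in the context window

words_input = Input(shape=(n_in,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings], trainable=False)(words_input)

casing_input = Input(shape=(n_in,), dtype='int32', name='casing_input')
casing = Embedding(input_dim=caseEmbeddings.shape[0], output_dim=caseEmbeddings.shape[1],
                   weights=[caseEmbeddings], trainable=False)(casing_input)

output = concatenate([words, casing])   # concatenate word and casing features
output = Flatten()(output)              # flatten the window into one feature vector
output = Dense(units=numHiddenUnits, activation='tanh')(output)
output = Dense(units=len(label2Idx), activation='softmax')(output)

model = Model(inputs=[words_input, casing_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')  # optimizer is a free choice
model.summary()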




##################################
#
# Training of the Network
#
##################################



number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    return prediction.argmax(axis=-1)

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    model.fit([train_tokens, train_case], train_y, epochs=1, batch_size=minibatch_size, verbose=True, shuffle=True)

    # Compute precision, recall, F1 on dev & test data
    pre_dev, rec_dev, f1_dev = BIOF1Validation.compute_f1(predict_classes(model.predict([dev_tokens, dev_case])), dev_y, idx2Label)
    pre_test, rec_test, f1_test = BIOF1Validation.compute_f1(predict_classes(model.predict([test_tokens, test_case])), test_y, idx2Label)

    print("%d. epoch: F1 on dev: %f, F1 on test: %f" % (epoch+1, f1_dev, f1_test))


8 changes: 4 additions & 4 deletions 2017-07_Seminar/Session 1 - SENNA/code for NER/preprocess.py
@@ -92,7 +92,7 @@ def readFile(filepath, tokenPosition, tagPosition):
     sentences = []
     sentence = []
 
-    for line in open(filepath):
+    for line in open(filepath, encoding="utf8"):
         line = line.strip()
 
         if len(line) == 0 or line[0] == '#':
@@ -190,12 +190,12 @@ def getCasing(word, caseLookup):

 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")
 
 for line in fEmbeddings:
     split = line.strip().split(" ")
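Without an explicit encoding, Python 3's open() falls back to the locale's preferred encoding (cp1252 on many Windows setups), so reading UTF-8 corpus and embeddings files can raise UnicodeDecodeError or silently corrupt non-ASCII characters; that is what this commit fixes. A minimal illustration (the file name is hypothetical):

f = open('embeddings.vocab')                   # decodes with the locale default; may fail on Windows
f = open('embeddings.vocab', encoding='utf8')  # decodes the same on every platform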
109 changes: 109 additions & 0 deletions 2017-07_Seminar/Session 1 - SENNA/code for POS/POS_template.py
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
"""
This is an example for performing sequence tagging with Keras.
We use the Universal Dependencies dataset (English) and implement the SENNA architecture (Collobert et al., NLP (almost) from scratch).
The code can easily be changed to any other sequence tagging task.
Performance after 10 epochs (Universal Dependencies POS English):
Dev-Accuracy: 96.29%
Test-Accuracy: 96.32%
Code was written & tested with:
- Python 2.7 & Python 3.6
- Theano 0.9.0 and tensorflow 1.2.1
- Keras 2.0.5
@author: Nils Reimers, www.deeplearning4nlp.com
"""
from __future__ import print_function
import numpy as np
import time
import gzip

import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl


import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate
from keras.layers import Embedding




numHiddenUnits = 100


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

#Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()

#####################################
#
# Create the Network
#
#####################################



print(train_tokens.shape[0], ' train samples')
print(train_tokens.shape[1], ' train dimension')
print(test_tokens.shape[0], ' test samples')

# !!TODO!!
# Add your Keras network here
# !! / TODO !!
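
# A minimal sketch of one possible network (the TODO above is the exercise):
# the same window-based SENNA architecture as in the NER template, assuming
# the array shapes that preprocess.py stores in the pkl files.
n_in = train_tokens.shape[1]  # number of tokens in the context window

words_input = Input(shape=(n_in,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings], trainable=False)(words_input)
casing_input = Input(shape=(n_in,), dtype='int32', name='casing_input')
casing = Embedding(input_dim=caseEmbeddings.shape[0], output_dim=caseEmbeddings.shape[1],
                   weights=[caseEmbeddings], trainable=False)(casing_input)

output = Flatten()(concatenate([words, casing]))  # window -> single feature vector
output = Dense(units=numHiddenUnits, activation='tanh')(output)
output = Dense(units=len(label2Idx), activation='softmax')(output)

model = Model(inputs=[words_input, casing_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')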



##################################
#
# Training of the Network
#
##################################



number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    return prediction.argmax(axis=-1)

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    model.fit([train_tokens, train_case], train_y, epochs=1, batch_size=minibatch_size, verbose=True, shuffle=True)

    dev_pred = predict_classes(model.predict([dev_tokens, dev_case]))
    dev_acc = np.sum(dev_pred == dev_y) / float(len(dev_y))
    print("Dev-Accuracy: %.2f" % (dev_acc*100))

    test_pred = predict_classes(model.predict([test_tokens, test_case]))
    test_acc = np.sum(test_pred == test_y) / float(len(test_y))
    print("Test-Accuracy: %.2f" % (test_acc*100))



6 changes: 3 additions & 3 deletions 2017-07_Seminar/Session 1 - SENNA/code for POS/preprocess.py
@@ -194,12 +194,12 @@ def getCasing(word, caseLookup):

 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")

 for line in fEmbeddings:
     split = line.strip().split(" ")
6 changes: 3 additions & 3 deletions 2017-07_Seminar/Session 2 - Sentence CNN/code/preprocess.py
@@ -126,12 +126,12 @@ def readFile(filepath):
 # :: Load the pre-trained embeddings file ::
 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")
 
 print("Load pre-trained embeddings file")
 for line in fEmbeddings: