Commit

Encodings

UKP committed Jul 11, 2017
1 parent 55ee01d commit d052047
Showing 10 changed files with 375 additions and 13 deletions.
111 changes: 111 additions & 0 deletions 2017-07_Seminar/Session 1 - SENNA/code for NER/NER_template.py
@@ -0,0 +1,111 @@
# -*- coding: utf-8 -*-
"""
This is an example for performing sequence tagging with Keras.
We use the GermEval 2014 NER dataset (German) and implement the SENNA architecture (Collobert et al., NLP (almost) from scratch).
The code can easily be changed to any other sequence tagging task.
Performance after 10 epochs (GermEval 2014 German NER):
Development F1-score: 70.3%
Test F1-score: 69.9%
Code was written & tested with:
- Python 2.7 & Python 3.6
- Theano 0.9.0 and tensorflow 1.2.1
- Keras 2.0.5
@author: Nils Reimers, www.deeplearning4nlp.com
"""
from __future__ import print_function
import numpy as np
import time
import gzip

import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl


import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate
from keras.layers import Embedding

import BIOF1Validation



numHiddenUnits = 100


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

#Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()

#####################################
#
# Create the Network
#
#####################################



print(train_tokens.shape[0], ' train samples')
print(train_tokens.shape[1], ' train dimension')
print(test_tokens.shape[0], ' test samples')



# !!TODO!!
# Add your Keras network here
# !! / TODO !!
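
# A minimal sketch of one possible network (the TODO above is the exercise):
# the window-based SENNA architecture, assuming the array shapes that
# preprocess.py stores in the pkl files. Treat it as a starting point,
# not the reference solution.
n_in = train_tokens.shape[1]  # number of tokens in the context window

words_input = Input(shape=(n_in,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings], trainable=False)(words_input)

casing_input = Input(shape=(n_in,), dtype='int32', name='casing_input')
casing = Embedding(input_dim=caseEmbeddings.shape[0], output_dim=caseEmbeddings.shape[1],
                   weights=[caseEmbeddings], trainable=False)(casing_input)

output = concatenate([words, casing])   # concatenate word and casing features
output = Flatten()(output)              # flatten the window into one feature vector
output = Dense(units=numHiddenUnits, activation='tanh')(output)
output = Dense(units=len(label2Idx), activation='softmax')(output)

model = Model(inputs=[words_input, casing_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')  # optimizer is a free choice
model.summary()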




##################################
#
# Training of the Network
#
##################################



number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    return prediction.argmax(axis=-1)

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    model.fit([train_tokens, train_case], train_y, epochs=1, batch_size=minibatch_size, verbose=True, shuffle=True)

    # Compute precision, recall, F1 on dev & test data
    pre_dev, rec_dev, f1_dev = BIOF1Validation.compute_f1(predict_classes(model.predict([dev_tokens, dev_case])), dev_y, idx2Label)
    pre_test, rec_test, f1_test = BIOF1Validation.compute_f1(predict_classes(model.predict([test_tokens, test_case])), test_y, idx2Label)

    print("%d. epoch: F1 on dev: %f, F1 on test: %f" % (epoch+1, f1_dev, f1_test))


8 changes: 4 additions & 4 deletions 2017-07_Seminar/Session 1 - SENNA/code for NER/preprocess.py
@@ -92,7 +92,7 @@ def readFile(filepath, tokenPosition, tagPosition):
     sentences = []
     sentence = []
 
-    for line in open(filepath):
+    for line in open(filepath, encoding="utf8"):
         line = line.strip()
 
         if len(line) == 0 or line[0] == '#':
@@ -190,12 +190,12 @@ def getCasing(word, caseLookup):

 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")
 
 for line in fEmbeddings:
     split = line.strip().split(" ")
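Without an explicit encoding, Python 3's open() falls back to the locale's preferred encoding (cp1252 on many Windows setups), so reading UTF-8 corpus and embeddings files can raise UnicodeDecodeError or silently corrupt non-ASCII characters; that is what this commit fixes. A minimal illustration (the file name is hypothetical):

f = open('embeddings.vocab')                   # decodes with the locale default; may fail on Windows
f = open('embeddings.vocab', encoding='utf8')  # decodes the same on every platform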
109 changes: 109 additions & 0 deletions 2017-07_Seminar/Session 1 - SENNA/code for POS/POS_template.py
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*-
"""
This is an example for performing sequence tagging with Keras.
We use the Universal Dependencies dataset (English) and implement the SENNA architecture (Collobert et al., NLP (almost) from scratch).
The code can easily be changed to any other sequence tagging task.
Performance after 10 epochs (Universal Dependencies POS English):
Dev-Accuracy: 96.29%
Test-Accuracy: 96.32%
Code was written & tested with:
- Python 2.7 & Python 3.6
- Theano 0.9.0 and tensorflow 1.2.1
- Keras 2.0.5
@author: Nils Reimers, www.deeplearning4nlp.com
"""
from __future__ import print_function
import numpy as np
import time
import gzip

import sys
if (sys.version_info > (3, 0)):
    import pickle as pkl
else: #Python 2.7 imports
    import cPickle as pkl


import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation, Flatten, concatenate
from keras.layers import Embedding




numHiddenUnits = 100


f = gzip.open('pkl/embeddings.pkl.gz', 'rb')
embeddings = pkl.load(f)
f.close()

label2Idx = embeddings['label2Idx']
wordEmbeddings = embeddings['wordEmbeddings']
caseEmbeddings = embeddings['caseEmbeddings']

#Inverse label mapping
idx2Label = {v: k for k, v in label2Idx.items()}

f = gzip.open('pkl/data.pkl.gz', 'rb')
train_tokens, train_case, train_y = pkl.load(f)
dev_tokens, dev_case, dev_y = pkl.load(f)
test_tokens, test_case, test_y = pkl.load(f)
f.close()

#####################################
#
# Create the Network
#
#####################################



print(train_tokens.shape[0], ' train samples')
print(train_tokens.shape[1], ' train dimension')
print(test_tokens.shape[0], ' test samples')

# !!TODO!!
# Add your Keras network here
# !! / TODO !!
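
# A minimal sketch of one possible network (the TODO above is the exercise):
# the same window-based SENNA architecture as in the NER template, assuming
# the array shapes that preprocess.py stores in the pkl files.
n_in = train_tokens.shape[1]  # number of tokens in the context window

words_input = Input(shape=(n_in,), dtype='int32', name='words_input')
words = Embedding(input_dim=wordEmbeddings.shape[0], output_dim=wordEmbeddings.shape[1],
                  weights=[wordEmbeddings], trainable=False)(words_input)
casing_input = Input(shape=(n_in,), dtype='int32', name='casing_input')
casing = Embedding(input_dim=caseEmbeddings.shape[0], output_dim=caseEmbeddings.shape[1],
                   weights=[caseEmbeddings], trainable=False)(casing_input)

output = Flatten()(concatenate([words, casing]))  # window -> single feature vector
output = Dense(units=numHiddenUnits, activation='tanh')(output)
output = Dense(units=len(label2Idx), activation='softmax')(output)

model = Model(inputs=[words_input, casing_input], outputs=[output])
model.compile(loss='sparse_categorical_crossentropy', optimizer='nadam')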



##################################
#
# Training of the Network
#
##################################



number_of_epochs = 10
minibatch_size = 128
print("%d epochs" % number_of_epochs)


def predict_classes(prediction):
    return prediction.argmax(axis=-1)

for epoch in range(number_of_epochs):
    print("\n------------- Epoch %d ------------" % (epoch+1))
    model.fit([train_tokens, train_case], train_y, epochs=1, batch_size=minibatch_size, verbose=True, shuffle=True)

    dev_pred = predict_classes(model.predict([dev_tokens, dev_case]))
    dev_acc = np.sum(dev_pred == dev_y) / float(len(dev_y))
    print("Dev-Accuracy: %.2f" % (dev_acc*100))

    test_pred = predict_classes(model.predict([test_tokens, test_case]))
    test_acc = np.sum(test_pred == test_y) / float(len(test_y))
    print("Test-Accuracy: %.2f" % (test_acc*100))



6 changes: 3 additions & 3 deletions 2017-07_Seminar/Session 1 - SENNA/code for POS/preprocess.py
@@ -194,12 +194,12 @@ def getCasing(word, caseLookup):

 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")

 for line in fEmbeddings:
     split = line.strip().split(" ")
6 changes: 3 additions & 3 deletions 2017-07_Seminar/Session 2 - Sentence CNN/code/preprocess.py
@@ -126,12 +126,12 @@ def readFile(filepath):
 # :: Load the pre-trained embeddings file ::
 if embeddingsPath.endswith('.gz'):
     try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt")
+        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
     except ValueError:
         # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r")
+        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
 else:
-    fEmbeddings = open(embeddingsPath)
+    fEmbeddings = open(embeddingsPath, encoding="utf8")
 
 print("Load pre-trained embeddings file")
 for line in fEmbeddings: