Encodings

UKPLab · Jul 11, 2017 · cd78a2d · cd78a2d
1 parent 506142b
commit cd78a2d
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 28 deletions.
diff --git a/2017-07_Seminar/Session 1 - SENNA/code for POS/preprocess.py b/2017-07_Seminar/Session 1 - SENNA/code for POS/preprocess.py
@@ -191,18 +191,12 @@ def getCasing(word, caseLookup):
     else:
         print(embeddingsPath, "does not exist. Please provide pre-trained embeddings")
         exit()
-
-if embeddingsPath.endswith('.gz'):
-    try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
-    except ValueError:
-        # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
-else:
-    fEmbeddings = open(embeddingsPath, encoding="utf8")
+
+# :: Load the pre-trained embeddings file as bytes (each line is UTF-8 decoded in the loop below) ::
+fEmbeddings = gzip.open(embeddingsPath, "rb") if embeddingsPath.endswith('.gz') else open(embeddingsPath, "rb")
 
 for line in fEmbeddings:
-    split = line.strip().split(" ")
+    split = line.decode("utf-8").strip().split(" ")
     word = split[0]
 
     if len(word2Idx) == 0: #Add padding+unknown

diff --git a/2017-07_Seminar/Session 2 - Sentence CNN/code/preprocess.py b/2017-07_Seminar/Session 2 - Sentence CNN/code/preprocess.py
@@ -124,18 +124,11 @@ def readFile(filepath):
         exit()
 
 # :: Load the pre-trained embeddings file ::
-if embeddingsPath.endswith('.gz'):
-    try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
-    except ValueError:
-        # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
-else:
-    fEmbeddings = open(embeddingsPath, encoding="utf8")
+fEmbeddings = gzip.open(embeddingsPath, "rb") if embeddingsPath.endswith('.gz') else open(embeddingsPath, "rb")  # bytes in both branches; each line is UTF-8 decoded in the loop below
 
 print("Load pre-trained embeddings file")
 for line in fEmbeddings:
-    split = line.strip().split(" ")
+    split = line.decode("utf-8").strip().split(" ")
     word = split[0]
 
     if len(word2Idx) == 0: #Add padding+unknown

diff --git a/2017-07_Seminar/Session 3 - Relation CNN/code/preprocess.py b/2017-07_Seminar/Session 3 - Relation CNN/code/preprocess.py
@@ -151,18 +151,11 @@ def getWordIdx(token, word2Idx):
         exit()
 
 # :: Load the pre-trained embeddings file ::
-if embeddingsPath.endswith('.gz'):
-    try:
-        fEmbeddings = gzip.open(embeddingsPath, "rt", encoding="utf8")
-    except ValueError:
-        # Workaround for Python 2.7 under Windows
-        fEmbeddings = gzip.open(embeddingsPath, "r", encoding="utf8")
-else:
-    fEmbeddings = open(embeddingsPath, encoding="utf8")
+fEmbeddings = gzip.open(embeddingsPath, "rb") if embeddingsPath.endswith('.gz') else open(embeddingsPath, "rb")  # bytes in both branches; each line is UTF-8 decoded in the loop below
 
 print("Load pre-trained embeddings file")
 for line in fEmbeddings:
-    split = line.strip().split(" ")
+    split = line.decode('utf-8').strip().split(" ")
     word = split[0]
 
     if len(word2Idx) == 0: #Add padding+unknown