name-ethnicity-classifier · theopfr · Apr 20, 2024 · Apr 4, 2024 · Apr 4, 2024 · Apr 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -2,4 +2,7 @@
 .env
 .pytest_cache/
 **/__pycache__/
-.fuse_*
+.fuse_*
+# ignore the contents, not the directory itself: git cannot re-include files of an excluded directory
+**/model_configurations/*
+!**/model_configurations/test_model
diff --git a/.vscode/launch.json b/.vscode/launch.json
@@ -15,5 +15,12 @@
             ],
             "justMyCode": true
         },
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "justMyCode": true
+        },
     ]
 }
diff --git a/README.md b/README.md
@@ -37,6 +37,10 @@ POSTGRES_PASSWORD=pwd123
 POSTGRES_DB=n2e-db
 POSTGRES_PORT=5432
 ADMINER_PORT=6060
+
+# Variables for inference
+BATCH_SIZE=64
+MAX_NAMES=1000
 ```
 
 ## 🏃 Run locally:

diff --git a/requirements.txt b/requirements.txt
@@ -9,4 +9,6 @@ SQLAlchemy==2.0.23
 Flask-SQLAlchemy==3.1.1
 email-validator==2.1.0.post1
 Flask-JWT-Extended==4.6.0
-bcrypt==4.1.2
+bcrypt==4.1.2
+torch==1.12.1
+numpy==1.26.4
diff --git a/src/app.py b/src/app.py
@@ -11,6 +11,7 @@
 from routes.model_routes import model_routes
 from routes.authentication_routes import authentication_routes
 from routes.util_routes import util_routes
+from routes.inference_routes import inference_routes
 
 load_dotenv()
 
@@ -40,6 +41,7 @@ def index():
 app.logger.setLevel(logging.INFO)
 app.register_blueprint(authentication_routes)
 app.register_blueprint(model_routes)
+app.register_blueprint(inference_routes)
 app.register_blueprint(util_routes)
 
 

diff --git a/src/errors.py b/src/errors.py
@@ -5,3 +5,9 @@ def __init__(self, error_code: str, message: str, status_code: int):
         self.message = message
         self.status_code = status_code
 
+
+class InferenceError(Exception):
+    """ Raised when an inference request cannot be processed, carries an error code and a message. """
+    def __init__(self, error_code: str, message: str):
+        super().__init__(message)  # fills args, so str(error) and logs show the message
+        self.error_code, self.message = error_code, message
diff --git a/src/inference/inference.py b/src/inference/inference.py
@@ -0,0 +1,197 @@
+
+import torch
+import torch.utils.data
+from torch.nn.utils.rnn import pad_sequence
+import numpy as np
+import string
+import os
+import unicodedata
+import re
+from dotenv import load_dotenv
+from errors import InferenceError
+from inference.model import ConvLSTM as Model
+from utils import load_json
+
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
+def replace_special_chars(name: str) -> str:
+    """
+    Reduces a name to plain latin letters, spaces and hyphens: accented letters are
+    mapped to their base letter, every other character (incl. numbers) is dropped
+    :param str name: name
+    :return str: normalized name
+    """
+
+    decomposed = unicodedata.normalize("NFD", u"{}".format(name))
+    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")
+
+    return re.sub("[^A-Za-z -]+", "", ascii_only)
+
+
+def preprocess_names(names: list[str]=None, batch_size: int=128) -> torch.tensor:
+    """
+    Creates pytorch-usable input-batches from a list of string-names. Each name is
+    normalized, encoded char-wise as indices 1 ("a") to 28 ("-") and zero-padded to
+    the longest name, ie: "joe" -> [10, 15, 5]
+    :param list names: list of names (strings)
+    :param int batch_size: batch-size for the forward pass
+    :return torch.tensor: iterable of batches, each shaped (batch, padded_length, 1);
+        an empty tuple if no names were given
+    """
+
+    if not names:
+        # pad_sequence() raises on an empty list, so short-circuit to "no batches"
+        return ()
+
+    # built once instead of per name; 0 stays reserved for the padding value
+    char_to_index = {char: idx for idx, char in enumerate(string.ascii_lowercase + " -", start=1)}
+
+    sample_batch = []
+    for name in names:
+        # normalize name to only latin characters, so every char is in the lookup
+        normalized = replace_special_chars(name).lower()
+        # explicit dtype: an empty name would otherwise become a float tensor
+        sample_batch.append(torch.tensor([char_to_index[c] for c in normalized], dtype=torch.long))
+
+    padded_batch = pad_sequence(sample_batch, batch_first=True)
+    padded_batch = padded_batch.unsqueeze(-1).to(device=device)
+
+    # a single batch is wrapped in a leading dim, so iterating always yields batches
+    if padded_batch.shape[0] == 1 or batch_size == padded_batch.shape[0]:
+        return padded_batch.unsqueeze(0)
+
+    return torch.split(padded_batch, batch_size)
+
+
+def classify_names(input_batch: torch.tensor, model_config: dict, classes: dict, get_distribution: bool=False) -> list:
+    """ load model and predict preprocessed names
+
+    The model is rebuilt and its parameters are read from disk on every call.
+
+    :param torch.tensor input_batch: iterable of input-batches (as created by preprocess_names)
+    :param dict model_config: hyperparameters of the model ("amount-classes", "embedding-size",
+        "hidden-size", "rnn-layers", "cnn-parameters") and the path to its saved parameters ("model-file")
+    :param dict classes: all classes the model can classify, handed on to the result helpers
+    :param get_distribution: Whether to return the entire distribution of the predicted nationalities
+    :return list: one entry per name, in input order: the most confident ethnicity or, if
+        get_distribution is set, the entire distribution
+    """
+
+    model = Model(
+        class_amount=model_config["amount-classes"],
+        embedding_size=model_config["embedding-size"],
+        hidden_size=model_config["hidden-size"],
+        layers=model_config["rnn-layers"],
+        kernel_size=model_config["cnn-parameters"][1],
+        channels=model_config["cnn-parameters"][2]
+    ).to(device=device)
+
+    # map_location=device remaps the stored tensors onto the device in use, so parameters
+    # saved on a GPU also load on a CPU-only host
+    # NOTE: torch.load unpickles the file, only load model files from a trusted source
+    model.load_state_dict(torch.load(model_config["model-file"], map_location=device))
+    model = model.eval()
+
+    # get the entire ethnicity confidence distribution for each name, or only the
+    # ethnicity with the highest confidence
+    if get_distribution:
+        collect_results = get_ethnicity_distributions
+    else:
+        collect_results = get_ethnicity_predictions
+
+    total_predicted_ethnicities = []
+
+    # classify names and store results, no_grad skips the gradient bookkeeping
+    # which is not needed for inference
+    with torch.no_grad():
+        for batch in input_batch:
+            predictions = model(batch.float()).cpu().numpy()
+            total_predicted_ethnicities.extend(collect_results(predictions, classes=classes))
+
+    return total_predicted_ethnicities
+
+
+def get_ethnicity_predictions(predictions: np.array, classes: list) -> list[tuple]:
+    """
+    Collects the highest confidence ethnicity for every prediction in a batch.
+    For example if the model classified a batch of two names into either "german" or "greek":
+    > [(german, 90.0), (greek, 80.0)]
+
+    :param predictions: The output predictions of the model (exponentiated here, ie expected to be log-probabilities)
+    :param classes: A list containing all the classes which a model can classify
+    :return: A list containing the predicted ethnicity and confidence score (in percent) for each name
+    """
+
+    predicted_ethnicities = []
+    for prediction in predictions:
+        # argmax yields the winning class in one pass (first index on ties, like list.index)
+        best_idx = int(np.argmax(prediction))
+        predicted_ethnicities.append((classes[best_idx], round(100 * float(np.exp(prediction[best_idx])), 3)))
+
+    return predicted_ethnicities
+
+
+def get_ethnicity_distributions(predictions: np.array, classes: list) -> list[dict]:
+    """
+    Builds one dictionary per prediction in a batch, mapping every class to its confidence,
+    ie 100 * exp(model output) rounded to three decimals.
+    For example for a batch of two names and the classes "german" and "greek":
+    > [{german: 90.0, greek: 10.0}, {german: 20.0, greek: 80.0}]
+
+    :param predictions: The output predictions of the model
+    :param classes: A list containing all the classes which a model can classify
+    :return: A list containing an output distribution for each name
+    """
+
+    def to_confidence(model_output) -> float:
+        return round(100 * float(np.exp(model_output)), 3)
+
+    # one dictionary per name, its keys follow the order of the classes
+    distributions = [
+        {ethnicity: to_confidence(prediction[idx]) for idx, ethnicity in enumerate(classes)}
+        for prediction in predictions
+    ]
+
+    return distributions
+
+
+def predict(model_id: str, names: list[str], get_distribution: bool=False) -> list:
+    """
+    Preprocesses and predicts the names.
+    :param model_id: The ID of the model to use (the name of its folder in model_configurations/)
+    :param names: A list of all names which are to classify
+    :param get_distribution: Whether to return the entire distribution of the predicted nationalities
+    :return: List of the predicted nationalities (and optionally the entire output distr.)
+    :raises InferenceError: with the code "TOO_MANY_NAMES" if more than MAX_NAMES names are given
+    """
+
+    load_dotenv()
+    # fall back to the values documented in the README if the variables are not set
+    MAX_NAMES = int(os.getenv("MAX_NAMES", "1000"))
+    BATCH_SIZE = int(os.getenv("BATCH_SIZE", "64"))
+
+    # reject too large requests before any file is read
+    if len(names) > MAX_NAMES:
+        raise InferenceError(
+            error_code="TOO_MANY_NAMES",
+            message=f"Too many names (maximum {MAX_NAMES}).")
+
+    model_directory = f"model_configurations/{model_id}"
+    stored_config = load_json(f"{model_directory}/config.json")
+    classes = load_json(f"{model_directory}/dataset/nationalities.json")
+    input_batch = preprocess_names(names=names, batch_size=BATCH_SIZE)
+
+    # only the values classify_names reads, the class amount follows from the class file
+    model_config = {
+        "model-file": f"{model_directory}/model.pt",
+        "amount-classes": len(classes),
+        "embedding-size": stored_config["embedding-size"],
+        "hidden-size": stored_config["hidden-size"],
+        "rnn-layers": stored_config["rnn-layers"],
+        "cnn-parameters": stored_config["cnn-parameters"]
+    }
+
+    return classify_names(input_batch, model_config, classes, get_distribution)
+
diff --git a/src/inference/model.py b/src/inference/model.py
@@ -0,0 +1,52 @@
+
+import torch
+import torch.nn as nn
+
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+
+class ConvLSTM(nn.Module):
+    """ Character embedding -> 1d convolution -> LSTM -> linear layer, outputs log-probabilities. """
+
+    def __init__(self, class_amount: int=0, embedding_size: int=64, hidden_size: int=10, layers: int=1, dropout_chance: float=0.5, kernel_size: int=3, channels: list=[32, 64, 128]):
+        """
+        :param class_amount: amount of output classes
+        :param embedding_size: size of the character embedding vectors
+        :param hidden_size: hidden size of the LSTM
+        :param layers: amount of stacked LSTM layers
+        :param dropout_chance: dropout probability in front of the linear layer
+        :param kernel_size: kernel size of the convolution
+        :param channels: channel amounts, only the first one is used (output channels of conv1)
+        """
+        super().__init__()
+
+        self.embedding_size = embedding_size
+        self.hidden_size = hidden_size
+        self.layers = layers
+        self.dropout_chance = dropout_chance
+        self.kernel_size = kernel_size
+        self.channels = list(channels)  # copy, so the shared default list is never aliased
+
+        self.embedder = nn.Embedding(29, self.embedding_size)
+        self.conv1 = nn.Sequential(nn.Conv1d(self.embedding_size, self.channels[0], kernel_size=self.kernel_size),
+                                   nn.ReLU())
+        # the LSTM consumes the output of conv1, so its input size is conv1's channel amount
+        self.lstm = nn.LSTM(input_size=self.channels[0], hidden_size=self.hidden_size, num_layers=self.layers, batch_first=True)
+        # plain dropout: Dropout2d is deprecated for the 2-D (batch, hidden) tensor used here
+        self.dropout = nn.Dropout(p=self.dropout_chance)
+        self.linear1 = nn.Linear(self.hidden_size, class_amount)
+        self.logSoftmax = nn.LogSoftmax(dim=1)
+
+    def forward(self, x: torch.tensor):
+        """ Maps index-encoded names, expected as (batch, length, 1), to (batch, class_amount) log-probabilities. """
+        # cast on the tensor's own device, then move to wherever the weights live
+        x = self.embedder(x.long().to(self.embedder.weight.device))
+        x = x.squeeze(2).transpose(1, 2)  # (batch, embedding, length) for Conv1d
+        x = self.conv1(x).transpose(1, 2)  # back to (batch, length', channels) for the LSTM
+        x, _ = self.lstm(x)
+        x = x[:, -1]  # output of the last time step
+        x = self.dropout(x)
+        return self.logSoftmax(self.linear1(x))
+
+
diff --git a/src/routes/authentication_routes.py b/src/routes/authentication_routes.py
@@ -6,7 +6,7 @@
 from schemas.user_schema import LoginSchema, SignupSchema
 from services.user_services import add_user, check_user_login
 from utils import success_response, error_response
-from errors import CustomError, CustomError
+from errors import CustomError
 
 authentication_routes = Blueprint("authentication", __name__)