Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds inference functions, routes and tests. #12

Merged
merged 5 commits into from
Apr 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@
.env
.pytest_cache/
**/__pycache__/
.fuse_*
.fuse_*
model_configurations/
!model_configurations/test_model
7 changes: 7 additions & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,12 @@
],
"justMyCode": true
},
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"justMyCode": true
},
]
}
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ POSTGRES_PASSWORD=pwd123
POSTGRES_DB=n2e-db
POSTGRES_PORT=5432
ADMINER_PORT=6060

# Variables for inference
BATCH_SIZE=64
MAX_NAMES=1000
```

## 🏃 Run locally:
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,6 @@ SQLAlchemy==2.0.23
Flask-SQLAlchemy==3.1.1
email-validator==2.1.0.post1
Flask-JWT-Extended==4.6.0
bcrypt==4.1.2
bcrypt==4.1.2
torch==1.12.1
numpy==1.26.4
2 changes: 2 additions & 0 deletions src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from routes.model_routes import model_routes
from routes.authentication_routes import authentication_routes
from routes.util_routes import util_routes
from routes.inference_routes import inference_routes

load_dotenv()

Expand Down Expand Up @@ -40,6 +41,7 @@ def index():
app.logger.setLevel(logging.INFO)
app.register_blueprint(authentication_routes)
app.register_blueprint(model_routes)
app.register_blueprint(inference_routes)
app.register_blueprint(util_routes)


Expand Down
6 changes: 6 additions & 0 deletions src/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ def __init__(self, error_code: str, message: str, status_code: int):
self.message = message
self.status_code = status_code


class InferenceError(Exception):
    """
    Error raised when an inference request cannot be processed.

    :param str error_code: short identifier of the failure (e.g. "TOO_MANY_NAMES")
    :param str message: human-readable description of the failure
    """

    def __init__(self, error_code: str, message: str):
        # Forward the message to Exception so that str(err), err.args and tracebacks are populated.
        super().__init__(message)
        self.error_code = error_code
        self.message = message

197 changes: 197 additions & 0 deletions src/inference/inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@

import torch
import torch.utils.data
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import string
import os
import unicodedata
import re
from dotenv import load_dotenv
from errors import InferenceError
from inference.model import ConvLSTM as Model
from utils import load_json


# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def replace_special_chars(name: str) -> str:
    """
    Reduces a name to plain latin letters, spaces and hyphens.

    Accented letters are decomposed (NFD) so that only their base letter survives the ASCII
    encoding step; afterwards everything that is not A-Z, a-z, " " or "-" is dropped.

    :param str name: raw name
    :return str: normalized name
    """

    as_text = u"{}".format(name)
    decomposed = unicodedata.normalize("NFD", as_text)
    ascii_only = decomposed.encode("ascii", "ignore").decode("utf-8")

    return re.sub("[^A-Za-z -]+", "", ascii_only)


def preprocess_names(names: list[str] = None, batch_size: int = 128) -> torch.tensor:
    """
    Creates a pytorch-usable input-batch from a list of string-names

    Every name is normalized, encoded as a sequence of character indices and zero-padded to the
    length of the longest name. The result is iterable batch by batch: a 4-d tensor holding one
    batch when everything fits into a single batch, otherwise the tuple returned by torch.split.

    :param list names: list of names (strings); None or an empty list yields an empty tuple
    :param int batch_size: batch-size for the forward pass
    :return torch.tensor: preprocessed names (to tensors, padded, encoded)
    """

    if names is None or len(names) == 0:
        # pad_sequence rejects an empty list; an empty tuple lets callers iterate over zero batches
        return ()

    # index-representation of the characters, ie: "joe" -> [10, 15, 5];
    # indices go from 1 ("a") to 28 ("-"), 0 stays free for the padding value
    alphabet = list(string.ascii_lowercase) + [" ", "-"]
    char_to_index = {char: idx for idx, char in enumerate(alphabet, start=1)}

    sample_batch = []
    for name in names:
        # normalize name to only latin characters
        name = replace_special_chars(name)

        int_name = [char_to_index[char] for char in name.lower()]

        # explicit dtype: a name that normalizes to "" would otherwise become a float tensor
        sample_batch.append(torch.tensor(int_name, dtype=torch.long))

    padded_batch = pad_sequence(sample_batch, batch_first=True)

    # (names, padded_length) -> (names, padded_length, 1)
    padded_to = list(padded_batch.size())[1]
    padded_batch = padded_batch.reshape(len(sample_batch), padded_to, 1).to(device=device)

    if padded_batch.shape[0] == 1 or batch_size == padded_batch.shape[0]:
        # everything fits into exactly one batch: add a leading "batch index" dimension
        padded_batch = padded_batch.unsqueeze(0)
    else:
        padded_batch = torch.split(padded_batch, batch_size)

    return padded_batch


def classify_names(input_batch: torch.tensor, model_config: dict, classes: dict, get_distribution: bool=False) -> list:
    """
    Load the model described by model_config and classify the preprocessed names.

    :param torch.tensor input_batch: iterable of input-batches (see preprocess_names)
    :param dict model_config: hyperparameters of the model ("amount-classes", "embedding-size", "hidden-size",
        "rnn-layers", "cnn-parameters") and the path to the saved model-parameters ("model-file")
    :param dict classes: the classes which the model can classify; handed on unchanged to
        get_ethnicity_predictions / get_ethnicity_distributions
    :param bool get_distribution: Whether to return the entire distribution of the predicted nationalities
    :return list: one result per name, batch after batch
    """

    # prepare model
    model = Model(
        class_amount=model_config["amount-classes"],
        embedding_size=model_config["embedding-size"],
        hidden_size=model_config["hidden-size"],
        layers=model_config["rnn-layers"],
        kernel_size=model_config["cnn-parameters"][1],
        channels=model_config["cnn-parameters"][2]
    ).to(device=device)

    # map_location=device places the stored parameters on the active device, so a model-file saved
    # on a GPU also loads on a CPU-only machine (a torch.device never compares equal to a string,
    # so a string comparison cannot be used to pick the branch).
    # NOTE(review): torch.load unpickles the file - only load trusted model-files.
    model_path = model_config["model-file"]
    model.load_state_dict(torch.load(model_path, map_location=device))

    model = model.eval()

    # entire confidence distribution per name, or only the ethnicity with the highest confidence
    collect_results = get_ethnicity_distributions if get_distribution else get_ethnicity_predictions

    total_predicted_ethnicities = []

    # classify names and store results; no gradients are needed for inference
    with torch.no_grad():
        for batch in input_batch:
            predictions = model(batch.float()).cpu().detach().numpy()
            total_predicted_ethnicities.extend(collect_results(predictions, classes=classes))

    return total_predicted_ethnicities


def get_ethnicity_predictions(predictions: np.array, classes: list) -> list[tuple[str, float]]:
    """
    Collects the highest confidence ethnicity for every prediction in a batch.
    For example if the model classified a batch of two names into either "german" or "greek":
    > [("german", 90.0), ("greek", 80.0)]

    :param predictions: The output predictions of the model (one row of log-probabilities per name)
    :param classes: A list containing all the classes which a model can classify
    :return: A list containing the predicted ethnicity and its confidence in percent
        (rounded to 3 decimals) for each name
    """

    predicted_ethnicities = []
    for prediction in predictions:
        # argmax returns the first index of the maximum, like list.index(max(...)) did
        prediction_idx = int(np.argmax(prediction))

        # exp() turns the log-probability back into a probability
        confidence = round(100 * float(np.exp(prediction[prediction_idx])), 3)
        predicted_ethnicities.append((classes[prediction_idx], confidence))

    return predicted_ethnicities


def get_ethnicity_distributions(predictions: np.array, classes: list) -> list[dict]:
    """
    Builds the full output distribution for every prediction in a batch.
    For example if the model classified a batch of two names into either "german" or "greek":
    > [{"german": 90.0, "greek": 10.0}, {"german": 20.0, "greek": 80.0}]

    :param predictions: The output predictions of the model
    :param classes: A list containing all the classes which a model can classify
    :return: A list containing one {ethnicity: confidence} dictionary per name, where the confidence
        is 100 * exp(model output), rounded to 3 decimals
    """

    return [
        {
            label: round(100 * float(np.exp(scores[position])), 3)
            for position, label in enumerate(classes)
        }
        for scores in predictions
    ]


def predict(model_id: str, names: list[str], get_distribution: bool=False) -> list:
    """
    Preprocesses and predicts the names.

    The limits are read from the environment variables MAX_NAMES and BATCH_SIZE; when a variable
    is not set, the example value from the README is used (1000 names, batch-size 64).

    :param model_id: The ID of the model to use
    :param names: A list of all names which are to classify
    :param get_distribution: Whether to return the entire distribution of the predicted nationalities
    :return: List with one result per name (the predicted nationality with its confidence, or the
        entire output distribution if get_distribution is set)
    :raises InferenceError: if more than MAX_NAMES names are passed
    """

    load_dotenv()

    max_names = int(os.getenv("MAX_NAMES", "1000"))
    batch_size = int(os.getenv("BATCH_SIZE", "64"))

    # reject oversized requests before touching any file
    if len(names) > max_names:
        raise InferenceError(
            error_code="TOO_MANY_NAMES",
            message=f"Too many names (maximum {max_names}).")

    # NOTE(review): model_id is interpolated into file paths - confirm that callers only pass
    # known model IDs (no path separators or "..").
    model_dir = f"model_configurations/{model_id}"
    stored_config = load_json(f"{model_dir}/config.json")
    classes = load_json(f"{model_dir}/dataset/nationalities.json")
    model_file = f"{model_dir}/model.pt"

    # preprocess inputs
    input_batch = preprocess_names(names=names, batch_size=batch_size)

    model_config = {
        "model-file": model_file,
        "amount-classes": len(classes),
        "embedding-size": stored_config["embedding-size"],
        "hidden-size": stored_config["hidden-size"],
        "rnn-layers": stored_config["rnn-layers"],
        "cnn-parameters": stored_config["cnn-parameters"]
    }

    # predict ethnicities
    return classify_names(input_batch, model_config, classes, get_distribution)

52 changes: 52 additions & 0 deletions src/inference/model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@

import torch
import torch.nn as nn


# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ConvLSTM(nn.Module):
    """
    Sequence classifier: embedding -> 1-dimensional convolution -> LSTM -> linear layer -> log-softmax.

    :param int class_amount: number of output classes
    :param int embedding_size: size of the embedding vectors
    :param int hidden_size: hidden size of the LSTM
    :param int layers: number of stacked LSTM layers
    :param float dropout_chance: dropout probability applied before the linear layer
    :param int kernel_size: kernel size of the convolution
    :param list channels: channel sizes; defaults to [32, 64, 128] when omitted
    """

    def __init__(self, class_amount: int=0, embedding_size: int=64, hidden_size: int=10, layers: int=1, dropout_chance: float=0.5, kernel_size: int=3, channels: list=None):
        super().__init__()

        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.layers = layers
        self.dropout_chance = dropout_chance

        self.kernel_size = kernel_size
        # fresh list per instance instead of one default list shared by every model
        self.channels = [32, 64, 128] if channels is None else channels

        # 29 embedding rows: index 0 plus the character indices 1..28
        # (presumably 0 is the padding value of the preprocessing - confirm)
        self.embedder = nn.Embedding(29, self.embedding_size)

        self.conv1 = nn.Sequential(nn.Conv1d(self.embedding_size, self.channels[0], kernel_size=self.kernel_size),
                                   nn.ReLU())

        # NOTE(review): conv1 emits channels[0] features while the LSTM expects channels[-1];
        # both only agree when the first and last entry of `channels` are equal (e.g. a
        # one-element list). Kept as is so that existing model-files still match.
        self.lstm = nn.LSTM(input_size=self.channels[-1], hidden_size=self.hidden_size, num_layers=self.layers, batch_first=True)

        # plain Dropout: the tensor it receives is 2-d (batch, hidden), for which Dropout2d is
        # deprecated; Dropout keeps the behavior and has no parameters, so state-dicts are unaffected
        self.dropout = nn.Dropout(p=self.dropout_chance)
        self.linear1 = nn.Linear(self.hidden_size, class_amount)
        self.logSoftmax = nn.LogSoftmax(dim=1)

    def forward(self, x: torch.tensor):
        """
        :param torch.tensor x: character indices of shape (batch, sequence length, 1)
        :return: log-probabilities of shape (batch, class_amount)
        """

        # Embedding: cast the indices to int64 directly on the device that holds the weights
        x = self.embedder(x.long().to(device=self.embedder.weight.device))
        # (batch, seq, 1, emb) -> (batch, emb, seq), the layout Conv1d expects
        x = x.squeeze(2).transpose(1, 2)

        # 1-dimensional CNN
        x = self.conv1(x)
        x = x.transpose(1, 2)

        # LSTM (only the output of the last time step is used)
        x, _ = self.lstm(x)
        x = x[:, -1]

        # Feed-Forward Layer
        x = self.dropout(x)
        x = self.linear1(x)
        x = self.logSoftmax(x)

        return x


2 changes: 1 addition & 1 deletion src/routes/authentication_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from schemas.user_schema import LoginSchema, SignupSchema
from services.user_services import add_user, check_user_login
from utils import success_response, error_response
from errors import CustomError, CustomError
from errors import CustomError

authentication_routes = Blueprint("authentication", __name__)

Expand Down
Loading
Loading