
Commit

Mega commit to pytorch/RNN switch with progressive progress bar and working classification
cmaloney111 committed Apr 6, 2024
1 parent b77c608 commit af74d83
Showing 9 changed files with 517 additions and 166 deletions.
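Note: the new code in this diff calls df.input_column.progress_apply and df.tokens.progress_apply, which exist only after tqdm's pandas integration has been registered. That registration is not visible in the hunks below, so the following is a minimal sketch of the assumed setup (presumably the "progressive progress bar" from the commit title):

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply

df = pd.DataFrame({'input_column': ['some text', 'more text']})
lengths = df.input_column.progress_apply(len)  # same as .apply, plus a progress bar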
136 changes: 85 additions & 51 deletions api/DataProcesser.py
@@ -3,20 +3,21 @@
from NbEmotionsModel import make_prediction
from NbLinRegressionModel import make_prediction_nblin
from available_classifiers import get_available_classifiers
from tensorflow.python.keras.models import load_model

import scipy as sp
import pandas as pd
import numpy as np
import tensorflow as tf
import Neural_Network2
import pickle
import re
import joblib
import numpy as np
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import torch
from collections import Counter
from functools import partial

import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
# more imports
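nltk.download('stopwords') above runs on every import of this module; a quieter variant that becomes a no-op once the corpus is cached (a style suggestion, not the committed code):

import nltk
nltk.download('stopwords', quiet=True)  # skips the console output when already downloaded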
@@ -31,7 +32,7 @@ def handle_classify(self, df, classifier):
model_name = classifier_switcher[classifier]
if model_name.endswith('.pkl'):
return self.pretrained_predict(df, model_name)
elif model_name.endswith('.h5'):
else:
return self.trained_predict(df, model_name)
#classifier_switcher = {
# 0: self.classify_emotions,
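The reworked dispatch above routes .pkl files to the pickled scikit-learn path and everything else to the new torch path. A hedged usage sketch; the class name DataProcesser (taken from the file name) and the id-to-filename mapping returned by get_available_classifiers() are assumptions here:

import pandas as pd

processor = DataProcesser()  # assumed class name, matching api/DataProcesser.py
df = pd.DataFrame({'input_column': ['I love this', 'this is awful']})

# get_available_classifiers() is assumed to return e.g. {0: 'emotions.pkl', 1: 'rnn-model.pt'}
result = processor.handle_classify(df, classifier=0)  # '.pkl' -> pretrained_predict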
@@ -54,16 +55,13 @@ def generate_statistics(self, df):


def preprocess_text(self, text):
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub("\\W", " ", text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)

return text
stop_words = set(stopwords.words('english'))
text = str(text)
text = re.sub(r'[^\w\s]', '', text)
text = text.lower()
tokens = wordpunct_tokenize(text)
tokens = [token for token in tokens if token not in stop_words]
return tokens
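A quick example of what the reworked preprocess_text returns; the exact output depends on NLTK's English stopword list:

processor = DataProcesser()  # assumed class name
tokens = processor.preprocess_text("Check this out: it isn't GREAT?")
# punctuation is stripped first, then the text is lowercased, tokenized,
# and stopwords are dropped
print(tokens)  # e.g. ['check', 'isnt', 'great']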


def classify_emotions(self, df):
@@ -90,44 +88,80 @@ def pretrained_predict(self, df, model_name):
return df

def load_weights_and_model(self, name):
model_filename = f"api/models/{name}"
num_classes = model_filename[model_filename.index("s") + 2 : model_filename.index("-")]
model = tf.keras.Sequential([
tf.keras.layers.Embedding(input_dim=20000, output_dim=128),
tf.keras.layers.LSTM(64),
tf.keras.layers.Dense(int(num_classes), activation='softmax')
])
model.load_weights(model_filename)
return model

def trained_predict(self, df, model_name):
model = self.load_weights_and_model(model_name)


encoder_name = model_name[model_name.index('l') + 2 : model_name.index('.')]
model_filename = os.path.join("api", "models", name)
if os.path.exists(model_filename):
model = torch.load(model_filename)
return model
else:
raise FileNotFoundError(f"Model file '{model_filename}' not found.")
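torch.load on a whole pickled model, as used above, requires the model's class definition to be importable at load time, and it tries to restore tensors to the device they were saved from. A defensive variant (an assumption, not the committed code) pins tensors to CPU so CPU-only hosts can load GPU-trained weights:

import torch

def load_model_cpu(model_filename):
    # map_location forces tensors onto CPU regardless of the saving device
    model = torch.load(model_filename, map_location=torch.device('cpu'))
    model.eval()  # inference mode: disables dropout, freezes batch-norm stats
    return model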

def predict_with_model(self, data, model):
# Convert data to PyTorch tensor if needed
data_tensor = torch.tensor(data, dtype=torch.float32)
# Perform prediction
predictions = model(data_tensor)
return predictions
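predict_with_model is a generic numeric path; trained_predict below does not use it and builds token-index inputs instead. A hypothetical call, with made-up feature rows:

import torch

features = [[0.1, 0.5, 0.2], [0.9, 0.3, 0.7]]  # assumed numeric feature rows
with torch.no_grad():  # gradients are not needed at inference time
    preds = processor.predict_with_model(features, model)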

label_map_filename = f"api\encoders/LabelMapping-{encoder_name}.joblib"
def trained_predict(self, df, model_name):
label_map_filename = f"api/encoders/LabelMapping-{model_name}.joblib"
label_encoder = joblib.load(label_map_filename)

raw_text = df['input_column'].tolist()

# prediction (not sure how to do this yet)
# vectorizer = TfidfVectorizer(max_features=20000)
# raw_text = [self.preprocess_text(text).encode("utf-8") for text in raw_text]
# vectorizer.fit_transform(raw_text)
# vectorized_data = vectorizer.transform(raw_text)

# vectorized_data = np.asarray(vectorized_data.todense())

# # Make predictions using the model

# predictions = model.predict(vectorized_data)

# predicted_labels_encoded = tf.argmax(predictions, axis=1).numpy()

# predicted_labels = [label_encoder.classes_[label] for label in predicted_labels_encoded]
model = self.load_weights_and_model(model_name)
model.eval()

# df['output_column'] = predicted_labels
stop_words = set(stopwords.words('english'))

df['tokens'] = df.input_column.progress_apply(
partial(Neural_Network2.tokenize, stop_words=stop_words),
)

# Replace rare words with <UNK>
all_tokens = [sublst for lst in df.tokens.tolist() for sublst in lst]
common_tokens = set(list(zip(
*Counter(all_tokens).most_common(20000)))[0])
df.loc[:, 'tokens'] = df.tokens.progress_apply(
partial(
Neural_Network2.remove_rare_words,
common_tokens=common_tokens,
max_len=200,
),
)
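Neural_Network2.remove_rare_words is not shown in this diff; from the call site above (keyword arguments common_tokens and max_len), a plausible shape is the sketch below, though the real implementation may differ:

def remove_rare_words(tokens, common_tokens, max_len):
    # mask tokens outside the frequent set as <UNK>, keep only the last max_len
    return [token if token in common_tokens else '<UNK>' for token in tokens][-max_len:]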

# Remove sequences with only <UNK>
df = df[df.tokens.progress_apply(
lambda tokens: any(token != '<UNK>' for token in tokens),
)]

vocab = sorted({
sublst for lst in df.tokens.tolist() for sublst in lst
})
self.token2idx = {token: idx for idx, token in enumerate(vocab)}

# Add a padding idx
self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

self.idx2token = {idx: token for token, idx in self.token2idx.items()}

df['indexed_tokens'] = df.tokens.apply(
lambda tokens: [self.token2idx[token] for token in tokens],
)
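Rebuilding token2idx from the inference DataFrame only matches the embedding indices the RNN was trained with if both sides see the same vocabulary in the same order. One way to guarantee that, sketched here by mirroring how LabelMapping-* is persisted above (the TokenMapping file name is an assumption), is to save the training-time mapping and reload it at prediction time:

import joblib

# at training time
joblib.dump(token2idx, f'api/encoders/TokenMapping-{model_name}.joblib')

# at inference time, instead of rebuilding the vocabulary from df
token2idx = joblib.load(f'api/encoders/TokenMapping-{model_name}.joblib')
unk_idx = token2idx.get('<UNK>', 0)
indexed = [token2idx.get(token, unk_idx) for token in tokens]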

# Decode predictions using label_encoder


predictions = []
for input_column_row in df['indexed_tokens']:
with torch.no_grad():
_, logits = model([input_column_row], return_activations=True)
logits = logits.detach().cpu().numpy()
prediction = np.argmax(logits, axis=1)[0]
predictions.append(prediction)

# Decode predictions using label encoder
decoded_predictions = label_encoder.inverse_transform(predictions)

# Assign decoded predictions back to the DataFrame
df['output_column'] = decoded_predictions

return df
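The loop above runs one forward pass per row. A hedged batched alternative, assuming the model accepts a list of equal-length index sequences, right-pads with <PAD> and classifies everything in a single call:

import numpy as np
import torch

pad_idx = self.token2idx['<PAD>']
seqs = df['indexed_tokens'].tolist()
longest = max(len(s) for s in seqs)
batch = [s + [pad_idx] * (longest - len(s)) for s in seqs]  # right-pad to a common length

with torch.no_grad():
    _, logits = model(batch, return_activations=True)
predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)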
