
Commit

Mega commit to pytorch/RNN switch with progressive progress bar and working classification
cmaloney111 committed Apr 6, 2024
1 parent b77c608 commit af74d83
Showing 9 changed files with 517 additions and 166 deletions.
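Note: the new code in this diff calls df.input_column.progress_apply and df.tokens.progress_apply, which exist only after tqdm's pandas integration has been registered. That registration is not visible in the hunks below, so the following is a minimal sketch of the assumed setup (presumably the "progressive progress bar" from the commit title):

import pandas as pd
from tqdm import tqdm

tqdm.pandas()  # registers Series.progress_apply / DataFrame.progress_apply

df = pd.DataFrame({'input_column': ['some text', 'more text']})
lengths = df.input_column.progress_apply(len)  # same as .apply, plus a progress bar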
136 changes: 85 additions & 51 deletions api/DataProcesser.py
@@ -3,20 +3,21 @@
from NbEmotionsModel import make_prediction
from NbLinRegressionModel import make_prediction_nblin
from available_classifiers import get_available_classifiers
from tensorflow.python.keras.models import load_model

import scipy as sp
import pandas as pd
import numpy as np
import tensorflow as tf
import Neural_Network2
import pickle
import re
import joblib
import numpy as np
import string
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import torch
from collections import Counter
from functools import partial

import nltk
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
# more imports
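nltk.download('stopwords') above runs on every import of this module; a quieter variant that becomes a no-op once the corpus is cached (a style suggestion, not the committed code):

import nltk
nltk.download('stopwords', quiet=True)  # skips the console output when already downloaded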
@@ -31,7 +32,7 @@ def handle_classify(self, df, classifier):
model_name = classifier_switcher[classifier]
if model_name.endswith('.pkl'):
return self.pretrained_predict(df, model_name)
elif model_name.endswith('.h5'):
else:
return self.trained_predict(df, model_name)
#classifier_switcher = {
# 0: self.classify_emotions,
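The reworked dispatch above routes .pkl files to the pickled scikit-learn path and everything else to the new torch path. A hedged usage sketch; the class name DataProcesser (taken from the file name) and the id-to-filename mapping returned by get_available_classifiers() are assumptions here:

import pandas as pd

processor = DataProcesser()  # assumed class name, matching api/DataProcesser.py
df = pd.DataFrame({'input_column': ['I love this', 'this is awful']})

# get_available_classifiers() is assumed to return e.g. {0: 'emotions.pkl', 1: 'rnn-model.pt'}
result = processor.handle_classify(df, classifier=0)  # '.pkl' -> pretrained_predict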
@@ -54,16 +55,13 @@ def generate_statistics(self, df):


def preprocess_text(self, text):
text = str(text).lower()
text = re.sub('\[.*?\]', '', text)
text = re.sub("\\W", " ", text)
text = re.sub('https?://\S+|www\.\S+', '', text)
text = re.sub('<.*?>+', '', text)
text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
text = re.sub('\n', '', text)
text = re.sub('\w*\d\w*', '', text)

return text
stop_words = set(stopwords.words('english'))
text = str(text)
text = re.sub(r'[^\w\s]', '', text)
text = text.lower()
tokens = wordpunct_tokenize(text)
tokens = [token for token in tokens if token not in stop_words]
return tokens
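A quick example of what the reworked preprocess_text returns; the exact output depends on NLTK's English stopword list:

processor = DataProcesser()  # assumed class name
tokens = processor.preprocess_text("Check this out: it isn't GREAT?")
# punctuation is stripped first, then the text is lowercased, tokenized,
# and stopwords are dropped
print(tokens)  # e.g. ['check', 'isnt', 'great']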


def classify_emotions(self, df):
@@ -90,44 +88,80 @@ def pretrained_predict(self, df, model_name):
return df

def load_weights_and_model(self, name):
model_filename = f"api/models/{name}"
num_classes = model_filename[model_filename.index("s") + 2 : model_filename.index("-")]
model = tf.keras.Sequential([
tf.keras.layers.Embedding(input_dim=20000, output_dim=128),
tf.keras.layers.LSTM(64),
tf.keras.layers.Dense(int(num_classes), activation='softmax')
])
model.load_weights(model_filename)
return model

def trained_predict(self, df, model_name):
model = self.load_weights_and_model(model_name)


encoder_name = model_name[model_name.index('l') + 2 : model_name.index('.')]
model_filename = os.path.join("api", "models", name)
if os.path.exists(model_filename):
model = torch.load(model_filename)
return model
else:
raise FileNotFoundError(f"Model file '{model_filename}' not found.")
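torch.load on a whole pickled model, as used above, requires the model's class definition to be importable at load time, and it tries to restore tensors to the device they were saved from. A defensive variant (an assumption, not the committed code) pins tensors to CPU so CPU-only hosts can load GPU-trained weights:

import torch

def load_model_cpu(model_filename):
    # map_location forces tensors onto CPU regardless of the saving device
    model = torch.load(model_filename, map_location=torch.device('cpu'))
    model.eval()  # inference mode: disables dropout, freezes batch-norm stats
    return model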

def predict_with_model(self, data, model):
# Convert data to PyTorch tensor if needed
data_tensor = torch.tensor(data, dtype=torch.float32)
# Perform prediction
predictions = model(data_tensor)
return predictions
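predict_with_model is a generic numeric path; trained_predict below does not use it and builds token-index inputs instead. A hypothetical call, with made-up feature rows:

import torch

features = [[0.1, 0.5, 0.2], [0.9, 0.3, 0.7]]  # assumed numeric feature rows
with torch.no_grad():  # gradients are not needed at inference time
    preds = processor.predict_with_model(features, model)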

label_map_filename = f"api\encoders/LabelMapping-{encoder_name}.joblib"
def trained_predict(self, df, model_name):
label_map_filename = f"api/encoders/LabelMapping-{model_name}.joblib"
label_encoder = joblib.load(label_map_filename)

raw_text = df['input_column'].tolist()

# prediction (not sure how to do this yet)
# vectorizer = TfidfVectorizer(max_features=20000)
# raw_text = [self.preprocess_text(text).encode("utf-8") for text in raw_text]
# vectorizer.fit_transform(raw_text)
# vectorized_data = vectorizer.transform(raw_text)

# vectorized_data = np.asarray(vectorized_data.todense())

# # Make predictions using the model

# predictions = model.predict(vectorized_data)

# predicted_labels_encoded = tf.argmax(predictions, axis=1).numpy()

# predicted_labels = [label_encoder.classes_[label] for label in predicted_labels_encoded]
model = self.load_weights_and_model(model_name)
model.eval()

# df['output_column'] = predicted_labels
stop_words = set(stopwords.words('english'))

df['tokens'] = df.input_column.progress_apply(
partial(Neural_Network2.tokenize, stop_words=stop_words),
)

# Replace rare words with <UNK>
all_tokens = [sublst for lst in df.tokens.tolist() for sublst in lst]
common_tokens = set(list(zip(
*Counter(all_tokens).most_common(20000)))[0])
df.loc[:, 'tokens'] = df.tokens.progress_apply(
partial(
Neural_Network2.remove_rare_words,
common_tokens=common_tokens,
max_len=200,
),
)
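Neural_Network2.remove_rare_words is not shown in this diff; from the call site above (keyword arguments common_tokens and max_len), a plausible shape is the sketch below, though the real implementation may differ:

def remove_rare_words(tokens, common_tokens, max_len):
    # mask tokens outside the frequent set as <UNK>, keep only the last max_len
    return [token if token in common_tokens else '<UNK>' for token in tokens][-max_len:]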

# Remove sequences with only <UNK>
df = df[df.tokens.progress_apply(
lambda tokens: any(token != '<UNK>' for token in tokens),
)]

vocab = sorted({
sublst for lst in df.tokens.tolist() for sublst in lst
})
self.token2idx = {token: idx for idx, token in enumerate(vocab)}

# Add a padding idx
self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

self.idx2token = {idx: token for token, idx in self.token2idx.items()}

df['indexed_tokens'] = df.tokens.apply(
lambda tokens: [self.token2idx[token] for token in tokens],
)
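Rebuilding token2idx from the inference DataFrame only matches the embedding indices the RNN was trained with if both sides see the same vocabulary in the same order. One way to guarantee that, sketched here by mirroring how LabelMapping-* is persisted above (the TokenMapping file name is an assumption), is to save the training-time mapping and reload it at prediction time:

import joblib

# at training time
joblib.dump(token2idx, f'api/encoders/TokenMapping-{model_name}.joblib')

# at inference time, instead of rebuilding the vocabulary from df
token2idx = joblib.load(f'api/encoders/TokenMapping-{model_name}.joblib')
unk_idx = token2idx.get('<UNK>', 0)
indexed = [token2idx.get(token, unk_idx) for token in tokens]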

# Decode predictions using label_encoder


predictions = []
for input_column_row in df['indexed_tokens']:
with torch.no_grad():
_, logits = model([input_column_row], return_activations=True)
logits = logits.detach().cpu().numpy()
prediction = np.argmax(logits, axis=1)[0]
predictions.append(prediction)

# Decode predictions using label encoder
decoded_predictions = label_encoder.inverse_transform(predictions)

# Assign decoded predictions back to the DataFrame
df['output_column'] = decoded_predictions

return df
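The loop above runs one forward pass per row. A hedged batched alternative, assuming the model accepts a list of equal-length index sequences, right-pads with <PAD> and classifies everything in a single call:

import numpy as np
import torch

pad_idx = self.token2idx['<PAD>']
seqs = df['indexed_tokens'].tolist()
longest = max(len(s) for s in seqs)
batch = [s + [pad_idx] * (longest - len(s)) for s in seqs]  # right-pad to a common length

with torch.no_grad():
    _, logits = model(batch, return_activations=True)
predictions = np.argmax(logits.detach().cpu().numpy(), axis=1)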
