diff --git a/api/DataProcesser.py b/api/DataProcesser.py
index 6eb67f29..3d75d800 100644
--- a/api/DataProcesser.py
+++ b/api/DataProcesser.py
@@ -19,6 +19,7 @@ import torch
 from collections import Counter
 from functools import partial
+import pickle
 
 import nltk
 from nltk.tokenize import wordpunct_tokenize
 
@@ -35,8 +36,11 @@ def handle_classify(self, df, classifier):
         classifier_switcher = get_available_classifiers()  # id: file name
         model_name = classifier_switcher[classifier]
         if model_name.endswith('.pkl'):
-            pipeline = self.get_pipeline(model_name)
-            return self.pretrained_predict(df, pipeline)
+            pipeline, custom = self.get_pipeline(model_name)
+            if custom:
+                return self.pretrained_predict(df, pipeline, model_name)
+            else:
+                return self.pretrained_predict(df, pipeline)
         else:
             return self.trained_predict(df, model_name)
         #classifier_switcher = {
@@ -58,8 +62,9 @@ def get_pipeline(self, model_name):
             df = pd.read_csv('api/training_df/nb_news.csv')
             train_data, test_data, train_target, test_target = train_test_split(df['short_description'], df['category'], test_size=0.2, shuffle=True)
         else:
-            return None
-        return make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(train_data, train_target)
+            with open(f'api/models/{model_name}', 'rb') as file:
+                return pickle.load(file), True
+        return make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(train_data, train_target), False
 
     def generate_statistics(self, df):
         unique_labels = df['output_column'].unique()
@@ -95,11 +100,15 @@ def nb_news_application(self, df):
         df['output_column'] = df['input_column'].apply(news_prediction)
         return df
 
-    def pretrained_predict(self, df, pipeline):
+    def pretrained_predict(self, df, pipeline, model_name=None):
         texts_to_predict = df['input_column']
         texts_to_predict = [str(text) for text in texts_to_predict]
         predictions = pipeline.predict(texts_to_predict)
-        df['output_column'] = predictions
+        if model_name:
+            label_map_filename = f"api/encoders/LabelMapping-{model_name.split('_')[0]}.joblib"
+            label_encoder = joblib.load(label_map_filename)
+            predictions = label_encoder.inverse_transform(predictions)
+        df['output_column'] = predictions
         return df
 
     def load_weights_and_model(self, name):
diff --git a/api/Neural_Network.py b/api/Neural_Network.py
new file mode 100644
index 00000000..8b82df08
--- /dev/null
+++ b/api/Neural_Network.py
@@ -0,0 +1,103 @@
+import re
+import os
+import joblib
+import json
+import pickle
+from functools import partial
+from collections import Counter
+
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk import wordpunct_tokenize
+from tqdm import tqdm
+from sklearn.preprocessing import LabelEncoder
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.naive_bayes import MultinomialNB
+from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, classification_report
+
+tqdm.pandas()
+
+import nltk
+nltk.download('stopwords')
+
+def tokenize(text, stop_words):
+    text = str(text)
+    text = re.sub(r'[^\w\s]', '', text)
+    text = text.lower()
+    tokens = wordpunct_tokenize(text)
+    tokens = [token for token in tokens if token not in stop_words]
+    return tokens
+
+class CustomDataset:
+    def __init__(self, df, max_vocab, max_len, name):
+        stop_words = set(stopwords.words('english'))
+
+        df['tokens'] = df.input_text.progress_apply(
+            partial(tokenize, stop_words=stop_words),
+        )
+
+        all_tokens = [token for tokens in df.tokens.tolist() for token in tokens]
+        common_tokens = set([token for token, _ in Counter(all_tokens).most_common(max_vocab)])
+        df['tokens'] = df.tokens.progress_apply(
+            partial(remove_rare_words, common_tokens=common_tokens, max_len=max_len),
+        )
+
+        df = df[df.tokens.progress_apply(lambda tokens: any(token != '' for token in tokens))]
+
+        df['clean_text'] = df.tokens.apply(lambda tokens: ' '.join(tokens))
+
+        self.text = df.clean_text.tolist()
+        self.labels = df.labels.tolist()
+
+        label_encoder = LabelEncoder()
+        self.encoded_labels = label_encoder.fit_transform(df['labels'])
+        encoder_name = f"LabelMapping-{name}.joblib"
+        encoder_filename = os.path.join("api", "encoders", encoder_name)
+        os.makedirs(os.path.dirname(encoder_filename), exist_ok=True)
+        joblib.dump(label_encoder, encoder_filename)
+
+def remove_rare_words(tokens, common_tokens, max_len):
+    return [token if token in common_tokens else '' for token in tokens][-max_len:]
+
+def create_and_train_nb_model(df, name, epochs=10, batch_size=16, learning_rate=0.001, valid_ratio=0.05, test_ratio=0.05):
+    max_vocab = 20000
+    max_len = 200
+
+    dataset = CustomDataset(df, max_vocab, max_len, name)
+
+    X_train, X_temp, y_train, y_temp = train_test_split(
+        dataset.text, dataset.encoded_labels, test_size=valid_ratio + test_ratio, random_state=42)
+    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=test_ratio / (valid_ratio + test_ratio), random_state=42)
+
+    # Creating a pipeline with TF-IDF vectorizer and Multinomial Naive Bayes
+    pipeline = make_pipeline(TfidfVectorizer(max_features=max_vocab), MultinomialNB())
+
+    # Fitting the model to the training data
+    pipeline.fit(X_train, y_train)
+
+    # Evaluating on validation set
+    valid_preds = pipeline.predict(X_valid)
+    valid_acc = accuracy_score(y_valid, valid_preds)
+    print(f'Validation Accuracy: {valid_acc:.2f}')
+    print(f'Validation Report:\n{classification_report(y_valid, valid_preds)}')
+
+    # Evaluating on test set
+    test_preds = pipeline.predict(X_test)
+    test_acc = accuracy_score(y_test, test_preds)
+    print(f'Test Accuracy: {test_acc:.2f}')
+    print(f'Test Report:\n{classification_report(y_test, test_preds)}')
+
+    # Saving the pipeline to a file
+    model_path = os.path.join('api', 'models', f"{name}_pipeline.pkl")
+    os.makedirs(os.path.dirname(model_path), exist_ok=True)
+    with open(model_path, "wb") as model_file:
+        pickle.dump(pipeline, model_file)
+
+    training_progress = {
+        'training_progress': 0,
+        'training_in_progress': False
+    }
+    with open('training_progress.json', 'w') as file:
+        json.dump(training_progress, file)
\ No newline at end of file
diff --git a/api/Neural_Network2.py b/api/Neural_Network2.py
index a3314d18..80440fe8 100644
--- a/api/Neural_Network2.py
+++ b/api/Neural_Network2.py
@@ -312,8 +312,7 @@ def validate_epoch(model, valid_loader, criterion):
 
     return total_loss / total
 
-
-def create_and_train_model(df, name, epochs=10, batch_size=32, learning_rate=0.001):
+def create_and_train_rnn_model(df, name, epochs=10, batch_size=32, learning_rate=0.001):
     # Initial settings and model preparation
     dropout_probability = 0.2
     n_rnn_layers = 1
diff --git a/api/app.py b/api/app.py
index 54ce00a6..bfa800c7 100644
--- a/api/app.py
+++ b/api/app.py
@@ -1,7 +1,8 @@
 from flask import Flask, jsonify, request
 from flask_cors import CORS
 from DataProcesser import DataProcesser
-from Neural_Network2 import create_and_train_model
+from Neural_Network2 import create_and_train_rnn_model
+from Neural_Network import create_and_train_nb_model
 from available_classifiers import get_available_classifiers
 
 import time
@@ -63,8 +64,8 @@ def shutdown():
     shutdown_server()
     return 'Server shutting down...'
 
-@app.route('/neural-network', methods=["POST"])
-def train_model():
+@app.route('/neural-network-rnn', methods=["POST"])
+def train_rnn_model():
     received_data = request.json
 
     if received_data:
@@ -98,7 +99,45 @@
 
     df = pd.DataFrame({'input_text': selected_data, 'labels': selected_label})
 
-    create_and_train_model(df, name, epochs, batch_size, learning_rate)
+    create_and_train_rnn_model(df, name, epochs, batch_size, learning_rate)
+
+    return jsonify({"message": "Model train started successfully."}), 200
+
+@app.route('/neural-network-nb', methods=["POST"])
+def train_nb_model():
+    received_data = request.json
+
+    if received_data:
+        selected_data = received_data.get('data')
+        selected_label = received_data.get('label')
+        epochs = received_data.get('epochs')
+        batch_size = received_data.get('batch_size')
+        learning_rate = received_data.get('learning_rate')
+        name = received_data.get('name')
+
+        # Log the received training request
+        print("\n")
+        print("Received data: " + str(len(selected_data)))
+        print("Received label: " + str(len(selected_label)))
+        print("Name: " + str(name))
+        print("Epochs: " + str(epochs))
+        print("Batch Size: " + str(batch_size))
+        print("Learning Rate: " + str(learning_rate))
+        print("\n")
+    else:
+        return jsonify({"message": "No data received."}), 400
+
+    # reset status
+    training_progress = {
+        'training_progress': 0,
+        'training_in_progress': True
+    }
+    with open('training_progress.json', 'w') as file:
+        json.dump(training_progress, file)
+
+    df = pd.DataFrame({'input_text': selected_data, 'labels': selected_label})
+
+    create_and_train_nb_model(df, name, epochs, batch_size, learning_rate)
 
     return jsonify({"message": "Model train started successfully."}), 200
diff --git a/src/pages/train.tsx b/src/pages/train.tsx
index 51c87bad..c20d55bf 100644
--- a/src/pages/train.tsx
+++ b/src/pages/train.tsx
@@ -3,7 +3,6 @@ import SelectFileCard from "../components/selectFileCard/selectFileCard";
 import axios from "axios";
 import ResultTable from "../components/resultTable/resultTable";
 import { Menu } from "../components/menu/menu";
-import ReactApexChart from "react-apexcharts";
 import { ReactApexChartsDefaultOptions } from "../Shared/apexChartsOptions";
 import Layout from "./layout/layout";
 import TrainView from "./views/trainView";
@@ -13,4 +12,4 @@ export default function Train() {
   return (
   );
-}
\ No newline at end of file
+}
diff --git a/src/pages/views/trainView.tsx b/src/pages/views/trainView.tsx
index de41e98f..5836b8a2 100644
--- a/src/pages/views/trainView.tsx
+++ b/src/pages/views/trainView.tsx
@@ -37,7 +37,7 @@ export default function TrainView() {
     setIsCancelling(false);
   };
 
-  const handleSubmit = async () => {
+  const handleRnnSubmit = async () => {
     setIsLoading(true);
     setHasTrained(true);
     setLoadingProgress(0);
@@ -63,25 +63,55 @@ export default function TrainView() {
 
     console.log(sendData);
 
-    const url = "http://localhost:5000/neural-network";
-
-    await axios
-      .post(url, sendData)
-      .catch(async (error) => {
-        await axios
-          .post(url, sendData)
-          .catch(async (error) => {
-            await axios
-              .post(url, sendData)
-              .catch(async (error) => {
-                await axios
-                  .post(url, sendData)
-                  .catch((error) => {
-                    throw new Error(error);
-                  })
-              })
-          })
+    const url = "http://localhost:5000/neural-network-rnn";
+
+    await axios.post(url, sendData).catch(async (error) => {
+      await axios.post(url, sendData).catch(async (error) => {
+        await axios.post(url, sendData).catch(async (error) => {
+          await axios.post(url, sendData).catch((error) => {
+            throw new Error(error);
+          });
+        });
+      });
+    });
+
+    setIsLoading(false);
+  };
+
+  const handleNbSubmit = async () => {
+    setIsLoading(true);
+    setLoadingProgress(0);
+
+    let selectedData = data.map((row) => ({
+      value: row[selectedColumn],
+      label: row[selectedLabel],
+    }));
+
+    let selectedLabels = data.map((row) => row[selectedLabel]);
+    let selectedValues = data.map((row) => row[selectedColumn]);
+
+    const sendData = {
+      data: selectedValues,
+      label: selectedLabels,
+      batch_size: batchSize || 16,
+      epochs: epochs || 50,
+      learning_rate: learningRate || 0.001,
+      name: modelName || "trained-model",
+    };
+
+    console.log(sendData);
+
+    const url = "http://localhost:5000/neural-network-nb";
+
+    await axios.post(url, sendData).catch(async (error) => {
+      await axios.post(url, sendData).catch(async (error) => {
+        await axios.post(url, sendData).catch(async (error) => {
+          await axios.post(url, sendData).catch((error) => {
+            throw new Error(error);
+          });
+        });
       });
+    });
 
     setIsLoading(false);
   };
@@ -131,9 +161,14 @@
   useEffect(() => {
     const fetchData = async () => {
       try {
-        const response = await axios.get("http://localhost:5000/training-status");
+        const response = await axios.get(
+          "http://localhost:5000/training-status"
+        );
         const { training_progress, training_in_progress, train_losses, valid_losses } = response.data;
-        const newProgress: number = training_in_progress || training_progress === 100 ? training_progress : 0;
+        const newProgress: number =
+          training_in_progress || training_progress === 100
+            ? training_progress
+            : 0; // Explicitly type newProgress
 
         updateLoadingProgress(newProgress);
         setTrainLosses(train_losses);
@@ -144,6 +179,7 @@
     };
 
     const updateLoadingProgress = (newProgress: number) => {
+      // Explicitly type newProgress parameter
      const duration = 1000;
      const startTime = Date.now();
      const startProgress = prevLoadingProgressRef.current;
@@ -151,7 +187,8 @@
     const updateProgress = () => {
       const elapsedTime = Date.now() - startTime;
       const progress = Math.min(1, elapsedTime / duration);
-      const interpolatedProgress = startProgress + (newProgress - startProgress) * progress;
+      const interpolatedProgress =
+        startProgress + (newProgress - startProgress) * progress;
       setLoadingProgress(interpolatedProgress);
 
       if (progress < 1) {
@@ -295,7 +332,7 @@
         {!isLoading &&
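
A note on the label-mapping contract this patch introduces: training writes the pipeline to api/models/{name}_pipeline.pkl and the encoder to api/encoders/LabelMapping-{name}.joblib, while prediction rebuilds the encoder path with model_name.split('_')[0]. The sketch below (not part of the patch; encoder_path_for is a hypothetical helper) shows that this only round-trips when the model name itself contains no underscore:

    # Hypothetical helper mirroring the lookup in DataProcesser.pretrained_predict.
    def encoder_path_for(model_filename: str) -> str:
        name = model_filename.split('_')[0]
        return f"api/encoders/LabelMapping-{name}.joblib"

    # Round-trips for underscore-free names:
    assert encoder_path_for("demomodel_pipeline.pkl") == "api/encoders/LabelMapping-demomodel.joblib"

    # Breaks for a name such as "my_model": training saved LabelMapping-my_model.joblib,
    # but the lookup resolves to LabelMapping-my.joblib.
    assert encoder_path_for("my_model_pipeline.pkl") == "api/encoders/LabelMapping-my.joblib"

If underscored names should be supported, model_filename.rsplit('_', 1)[0] strips only the trailing "_pipeline.pkl" segment and keeps the full name intact.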
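
For reference, a client call against the new /neural-network-nb route; the payload keys mirror what train_nb_model() reads with received_data.get(...), and the host, port, and sample values are illustrative:

    # Sketch of a training request against the new route (values are assumptions).
    import requests

    payload = {
        "data": ["cheap flights to paris", "team wins championship"],
        "label": ["TRAVEL", "SPORTS"],
        "epochs": 10,            # accepted for parity with the RNN route; unused by Naive Bayes
        "batch_size": 16,        # likewise unused by the sklearn pipeline
        "learning_rate": 0.001,  # likewise unused
        "name": "demomodel",     # underscore-free, per the note above
    }

    response = requests.post("http://localhost:5000/neural-network-nb", json=payload)
    print(response.status_code, response.json())

Note that create_and_train_nb_model() accepts epochs, batch_size, and learning_rate only to keep the two training routes interchangeable; MultinomialNB fits in a single pass and ignores all three.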
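
Finally, the custom-model prediction path that get_pipeline() and pretrained_predict() now perform (with inverse_transform guarded by model_name, so the default nb_news pipeline, which never saves an encoder, still works), condensed into a standalone sketch; file names assume a model trained as "demomodel":

    # Load the pickled sklearn pipeline, predict, then map ids back to labels.
    import pickle
    import joblib

    with open("api/models/demomodel_pipeline.pkl", "rb") as f:  # get_pipeline(): custom branch
        pipeline = pickle.load(f)

    encoder = joblib.load("api/encoders/LabelMapping-demomodel.joblib")

    encoded = pipeline.predict(["new budget airline announced"])  # integer class ids
    labels = encoder.inverse_transform(encoded)                   # original string labels
    print(list(labels))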