diff --git a/api/DataProcesser.py b/api/DataProcesser.py index be56cbda..7c82fff2 100644 --- a/api/DataProcesser.py +++ b/api/DataProcesser.py @@ -7,19 +7,19 @@ from nltk.corpus import stopwords # mais imports - class DataProcesser(): - df = pd.DataFrame() input_column = '' - stopwordsenglish = nltk.corpus.stopwords.words('english') - + stopwords_english = stopwords.words('english') def handle_classify(self, df, classifier): - if classifier == 'a': - return self.classify_emotions(df) - elif classifier == 'b': - return self.nb_news_application(df) + classifier_switcher = { + 0: self.classify_emotions, + 1: self.nb_news_application, + 2: self.lin_regression_model + } + + return classifier_switcher.get(classifier, lambda: "Invalid Classifier")(df) def preprocess_text(self, texto): if self.input_column is not None: # Verifique se a coluna foi definida @@ -27,34 +27,32 @@ def preprocess_text(self, texto): texto = re.sub('[^a-z\s]', '', texto.lower()) # tokenizo em palavras e elimino as stopwords palavras = [w for w in texto.split( - ) if w not in set(self.stopwordsenglish)] + ) if w not in set(self.stopwords_english)] palavras = [w for w in texto if nltk.corpus.wordnet.synsets(w)] # texto_junto = ' '.join(texto) # junto as palavras novamente com espaços - return ' '.join(palavras) + return ''.join(palavras) else: return "Coluna não escolhida. Escolha a coluna primeiro." - def nb_news_application(self, df): - nb_model = NbNewsModel(df) - df_result = nb_model.filter_and_classify() - return df_result - def classify_emotions(self, df): df['output_column'] = df['input_column'].apply( self.preprocess_text).apply(make_prediction) + result_csv = df # converte o df pra csv return result_csv def lin_regression_model(self, df): df['output_column'] = df['input_column'].apply( self.preprocess_text).apply(make_prediction_nblin) + result_csv = df # converte o df pra csv return result_csv - def nb_news_application(self): - self.df['coluna_classificada'] = self.df[self.input_column].apply(self.preprocess_text).apply(news_prediction) - result_csv = self.df + def nb_news_application(self, df): + df['output_column'] = df['input_column'].apply(news_prediction) + + result_csv = df return result_csv ##TODO métodos com o processamento de classificação diff --git a/api/NbLinRegressionModel.py b/api/NbLinRegressionModel.py index 4281320a..472aecd8 100644 --- a/api/NbLinRegressionModel.py +++ b/api/NbLinRegressionModel.py @@ -7,7 +7,7 @@ def make_prediction_nblin(my_sentence): with open("./models/linear_reg.pkl", "rb") as f: model = pickle.load(f) - new_sentence = vectorizer.transform(my_sentence) + new_sentence = vectorizer.transform([my_sentence]) prediction = model.predict(new_sentence) if prediction == 0: diff --git a/api/NbNewsModel.py b/api/NbNewsModel.py index 174def97..bbe8f9ba 100644 --- a/api/NbNewsModel.py +++ b/api/NbNewsModel.py @@ -2,18 +2,16 @@ import pickle def news_prediction(texts): - model_file = "api/models/text_classification_pipeline.pkl" + model_file = "./models/text_classification_pipeline.pkl" try: # Carregando o pipeline do arquivo .pkl with open(model_file, 'rb') as model_file: pipeline = pickle.load(model_file) # Fazendo previsões para os textos - predictions = pipeline.predict(texts) + predictions = pipeline.predict([texts]) + + return predictions[0] - return predictions - except Exception as e: return str(e) -# df = pd.read_csv("api/training_df/nb_news.csv") -# print(news_prediction(df['short_description'])) \ No newline at end of file diff --git a/api/models/nb_news.py b/api/models/nb_news.py index db90a8dd..b32627d8 100644 --- a/api/models/nb_news.py +++ b/api/models/nb_news.py @@ -6,7 +6,7 @@ from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB -df = pd.read_csv("api/training_df/nb_news.csv") +df = pd.read_csv("../training_df/nb_news.csv") # Dividindo os dados em um conjunto de treinamento e um conjunto de teste x = df['short_description'] y = df['category'] diff --git a/api/models/text_classification_pipeline.pkl b/api/models/text_classification_pipeline.pkl index be499f23..7ae389e6 100644 Binary files a/api/models/text_classification_pipeline.pkl and b/api/models/text_classification_pipeline.pkl differ