From f25c80587a732e64a5c6a5018b29a045529a2094 Mon Sep 17 00:00:00 2001 From: Jonas Gabriel Date: Fri, 3 Nov 2023 13:49:34 -0300 Subject: [PATCH] classifiers small fixes --- api/DataProcesser.py | 34 +++++++++----------- api/NbLinRegressionModel.py | 2 +- api/NbNewsModel.py | 10 +++--- api/models/nb_news.py | 2 +- api/models/text_classification_pipeline.pkl | Bin 5389646 -> 5389722 bytes 5 files changed, 22 insertions(+), 26 deletions(-) diff --git a/api/DataProcesser.py b/api/DataProcesser.py index be56cbda..7c82fff2 100644 --- a/api/DataProcesser.py +++ b/api/DataProcesser.py @@ -7,19 +7,19 @@ from nltk.corpus import stopwords # mais imports - class DataProcesser(): - df = pd.DataFrame() input_column = '' - stopwordsenglish = nltk.corpus.stopwords.words('english') - + stopwords_english = stopwords.words('english') def handle_classify(self, df, classifier): - if classifier == 'a': - return self.classify_emotions(df) - elif classifier == 'b': - return self.nb_news_application(df) + classifier_switcher = { + 0: self.classify_emotions, + 1: self.nb_news_application, + 2: self.lin_regression_model + } + + return classifier_switcher.get(classifier, lambda: "Invalid Classifier")(df) def preprocess_text(self, texto): if self.input_column is not None: # Verifique se a coluna foi definida @@ -27,34 +27,32 @@ def preprocess_text(self, texto): texto = re.sub('[^a-z\s]', '', texto.lower()) # tokenizo em palavras e elimino as stopwords palavras = [w for w in texto.split( - ) if w not in set(self.stopwordsenglish)] + ) if w not in set(self.stopwords_english)] palavras = [w for w in texto if nltk.corpus.wordnet.synsets(w)] # texto_junto = ' '.join(texto) # junto as palavras novamente com espaços - return ' '.join(palavras) + return ''.join(palavras) else: return "Coluna não escolhida. Escolha a coluna primeiro." - def nb_news_application(self, df): - nb_model = NbNewsModel(df) - df_result = nb_model.filter_and_classify() - return df_result - def classify_emotions(self, df): df['output_column'] = df['input_column'].apply( self.preprocess_text).apply(make_prediction) + result_csv = df # converte o df pra csv return result_csv def lin_regression_model(self, df): df['output_column'] = df['input_column'].apply( self.preprocess_text).apply(make_prediction_nblin) + result_csv = df # converte o df pra csv return result_csv - def nb_news_application(self): - self.df['coluna_classificada'] = self.df[self.input_column].apply(self.preprocess_text).apply(news_prediction) - result_csv = self.df + def nb_news_application(self, df): + df['output_column'] = df['input_column'].apply(news_prediction) + + result_csv = df return result_csv ##TODO métodos com o processamento de classificação diff --git a/api/NbLinRegressionModel.py b/api/NbLinRegressionModel.py index 4281320a..472aecd8 100644 --- a/api/NbLinRegressionModel.py +++ b/api/NbLinRegressionModel.py @@ -7,7 +7,7 @@ def make_prediction_nblin(my_sentence): with open("./models/linear_reg.pkl", "rb") as f: model = pickle.load(f) - new_sentence = vectorizer.transform(my_sentence) + new_sentence = vectorizer.transform([my_sentence]) prediction = model.predict(new_sentence) if prediction == 0: diff --git a/api/NbNewsModel.py b/api/NbNewsModel.py index 174def97..bbe8f9ba 100644 --- a/api/NbNewsModel.py +++ b/api/NbNewsModel.py @@ -2,18 +2,16 @@ import pickle def news_prediction(texts): - model_file = "api/models/text_classification_pipeline.pkl" + model_file = "./models/text_classification_pipeline.pkl" try: # Carregando o pipeline do arquivo .pkl with open(model_file, 'rb') as model_file: pipeline = pickle.load(model_file) # Fazendo previsões para os textos - predictions = pipeline.predict(texts) + predictions = pipeline.predict([texts]) + + return predictions[0] - return predictions - except Exception as e: return str(e) -# df = pd.read_csv("api/training_df/nb_news.csv") -# print(news_prediction(df['short_description'])) \ No newline at end of file diff --git a/api/models/nb_news.py b/api/models/nb_news.py index db90a8dd..b32627d8 100644 --- a/api/models/nb_news.py +++ b/api/models/nb_news.py @@ -6,7 +6,7 @@ from sklearn.model_selection import train_test_split from sklearn.naive_bayes import MultinomialNB -df = pd.read_csv("api/training_df/nb_news.csv") +df = pd.read_csv("../training_df/nb_news.csv") # Dividindo os dados em um conjunto de treinamento e um conjunto de teste x = df['short_description'] y = df['category'] diff --git a/api/models/text_classification_pipeline.pkl b/api/models/text_classification_pipeline.pkl index be499f23662c40d7225f9e34fcab3b9f3f59556b..7ae389e6e3e0d1ae784663b9360c3bb921d0b816 100644 GIT binary patch delta 902 zcmaLV%TE(Q90%}ryM6GGhde5Rq9XE=N(B_KJVXR!sSgqpE@U%or_O3^o87hs>xlDAyG=KIhZpMANBH&bA9n9>W<*%*82ReIU<_!P!S9< zMfC&^-s?5oH_2T}HNt~W>&$V>^}5uduu!ga-4&lKLR5vyBob$Y8{!=#;>^F>ut$jy zT}S1d{n?OInJ>t3-QV?lL*~16*SuK$*WEKY?Y-+|&E6S~n&)gLJ+k2fH+Y}`3ZV#! zVIP!0DU?Au?1u`dges^8FC2gxI0&^+2Z!J=)WZ=t3df)Ujzc4yfF@{$7HBm`HrhNZ z|H(`jPp0W&JNV!fbbudDLnm}WH=Kd9&;z~D2j?IF{V)K7Fa+me7%spFT!c{=gK@Y7 n6ClAP1mQAVfvYB6yyjl{*Bv|vOg+ipeS7~@tqVUva@O}941iy1 delta 797 zcmaLUKTH#G6bJC$pI%!*KrPjlDo908^eD9=Xe$Du;Blf&oEYlSUcDM)YOiQ84m!w5 z(h`#X>Ln(`)x?1`kX%@t9Bz?`0S6gOs*^)UHKu-BH8G<8lF#sdzxVQfFYnn`lV1N~ z((FUFkSwyRRQ?mykvFJ5k0Y9lC!+DBbc1A@)L1e#%d06Vt;&3Ul3t({-%5#2ls#c( zP_R5E%8*7Pq)VX0i4dZ(`C87wYFa;3d}55wvUg=QL&OIIS=&wNzAUFDBEIw%zBSj{S&Zi<_qvgZD%bVI zCYQ>$(aw>&+tGw9WoFW{8YQ9-%GzVMBvq|O%Ab=7&e~wiwcg45?1TnrgeKSpyI~JFp&4AT7xsZ0JkSDO*bfJw z724n+w1W>0K?fX$PUwOoaMW1b>b5Wc=g-C5F|C+84m|XLA9^7GCm;wXp$|?$KMcTW zI0GRF!&w-Fb1($MFaqac6fVF;7=v-R1OiNe2$x|Jt{BDKRon8vcCa3byr`{xHXLc; Ju>YB9{yXs9OhW(w