Commit
Merge pull request #4 from TailUFPB/nn-fakenews
Nn fakenews
Showing 20,010 changed files with 65,263 additions and 1 deletion.
The diff you're trying to view is too large. We only load the first 3000 changed files.
@@ -0,0 +1,17 @@
import pickle


def nn_fakenews_precition(texts):
    model_file = "api/models/nn_fakenews_model.pkl"
    try:
        # Load the pickled model from the .pkl file
        with open(model_file, 'rb') as f:
            pipeline = pickle.load(f)

        # Run predictions on the input texts
        predictions = pipeline.predict(texts)

        return predictions

    except Exception as e:
        return str(e)
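
A hypothetical call site for this helper (the import path and sample texts are assumptions, not part of this diff). Note that the function returns the exception message as a string on failure, so callers may want to check the return type:

# Hypothetical usage sketch; the module path is an assumption.
from api.nn_prediction import nn_fakenews_precition

sample_texts = [
    "Breaking: miracle cure discovered, doctors hate it!",
    "The central bank raised interest rates by 0.25 points on Tuesday.",
]

result = nn_fakenews_precition(sample_texts)
if isinstance(result, str):
    print(f"Prediction failed: {result}")  # the error-message path
else:
    print(result)  # model predictions for each input text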
@@ -0,0 +1,143 @@
import json
import re
import string
import os
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def create_and_train_model(train_texts, train_labels, name, epochs=5):
    # Build a pandas DataFrame from the training texts and labels
    train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
    # Apply the preprocessing to the texts
    train_df['text'] = train_df['text'].apply(preprocess_text)

    # Create a directory to store the training texts as individual files
    output_directory = 'arquivos_texto_treino_nn'
    os.makedirs(output_directory, exist_ok=True)

    # Write each text to its own file inside a per-label subdirectory,
    # so that text_dataset_from_directory can infer the labels from the
    # directory structure
    for index, row in train_df.iterrows():
        label_dir = os.path.join(output_directory, str(row['label']))
        os.makedirs(label_dir, exist_ok=True)
        filename = os.path.join(label_dir, f'texto_{index}.txt')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(row['text'])
    # Training directory for the model
    treino_dir = output_directory

    # Build a text dataset using the TensorFlow dataset utilities
    train_dataset = tf.keras.utils.text_dataset_from_directory(
        treino_dir,
        batch_size=32,
        shuffle=True,
    )

    # Model parameters
    max_features = 20000
    embedding_dim = 128
    sequence_length = 500

    # Create a text-vectorization layer
    vectorize_layer = TextVectorization(
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # Adapt the vectorization layer to the text dataset
    text_ds = train_dataset.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    # Vectorize the text while keeping the labels
    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Apply the vectorization to the training dataset
    train_ds = train_dataset.map(vectorize_text)
    train_ds = train_ds.cache().prefetch(buffer_size=10)

    # Define the model architecture
    inputs = tf.keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.5)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    # Create and compile the model
    model = tf.keras.Model(inputs, predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Train the model
    history = model.fit(train_ds, epochs=epochs)
    # File name for the saved model
    name = f"nn_{name}.pkl"
    # Save the model
    with open(name, "wb") as model_file:
        pickle.dump(model, model_file)

    # Collect training statistics
    training_stats = {
        "loss": history.history['loss'],
        "accuracy": history.history['accuracy']
    }

    # Return the statistics as JSON
    return json.dumps(training_stats)


'''
With the saved file name (f"nn_{name}.pkl") we can, for example,
load the trained model back from the file:
with open("nn_Teste.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
Now loaded_model can be used to make predictions, e.g.:
predictions = loaded_model.predict(new_texts)
'''
'''
TEST:
df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")

df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)

df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]

textos = df_fake_train['text'].tolist() + df_true_train['text'].tolist()
labels = [0] * len(df_fake_train) + [1] * len(df_true_train)

create_and_train_model(textos, labels, "Teste")
'''
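
One caveat with the save step above: pickle.dump stores only the Keras model, not the adapted TextVectorization layer, so raw strings cannot be passed straight to loaded_model.predict. A minimal load-and-predict sketch under that assumption: the vectorizer is rebuilt with the same settings and must be re-adapted on the same training corpus (placeholders below); the file name comes from the TEST block.

import pickle
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Placeholder stand-ins; use the real training corpus and real inputs.
train_texts = ["placeholder training text"]
new_texts = ["some headline to classify"]

# Rebuild the vectorizer with the same settings used in create_and_train_model;
# it must be adapted on the same corpus to reproduce the training vocabulary.
vectorize_layer = TextVectorization(
    max_tokens=20000, output_mode="int", output_sequence_length=500
)
vectorize_layer.adapt(tf.constant(train_texts))

# "nn_Teste.pkl" is the file name produced by the TEST block above.
with open("nn_Teste.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Sigmoid scores in [0, 1]; threshold at 0.5 for a hard fake/true label.
scores = loaded_model.predict(vectorize_layer(tf.constant(new_texts)))
print((scores > 0.5).astype(int))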
@@ -0,0 +1,185 @@
import re
import string
import os
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")

df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)


def wordopt(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


df_fake['text'] = df_fake["text"].apply(wordopt)
df_true['text'] = df_true["text"].apply(wordopt)

df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]

df_fake_test = df_fake[5000:10000]
df_true_test = df_true[5000:10000]

# Directory for the fake training text files
output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/fake'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_fake_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/true'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_true_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])


# Directory for the fake test text files
output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/fake'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_fake_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/true'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_true_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

treino_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_treino_nn")
teste_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_teste_nn")

train_dataset = tf.keras.utils.text_dataset_from_directory(
    treino_dir,
    batch_size=32,
    shuffle=True,
)

test_dataset = tf.keras.utils.text_dataset_from_directory(
    teste_dir,
    batch_size=32,
    shuffle=True,
)

# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Now that the vectorize_layer has been created, call `adapt` on a text-only
# dataset to create the vocabulary. You don't have to batch, but for very large
# datasets this means you're not keeping spare copies of the dataset in memory.

# Let's make a text-only dataset (no labels):
text_ds = train_dataset.map(lambda x, y: x)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Vectorize the data.
train_ds = train_dataset.map(vectorize_text)
test_ds = test_dataset.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)
epochs = 5

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model on the training dataset.
model.fit(train_ds, epochs=epochs)

# Save the trained model to a .pkl file
with open("LinguifAI/api/models/nn_fakenews_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
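
The script prepares test_ds but never evaluates on it. A minimal follow-up sketch, run in the same session so that model and test_ds are still in scope:

# Evaluate the trained model on the held-out test split prepared above;
# test_ds is already vectorized, batched, and prefetched.
loss, accuracy = model.evaluate(test_ds)
print(f"test loss: {loss:.4f}, test accuracy: {accuracy:.4f}")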