
Commit

Merge pull request #4 from TailUFPB/nn-fakenews
Nn fakenews
jonasgabriel18 authored Dec 8, 2023
2 parents ee637b2 + 1f24f07 commit a9fcff4
Showing 20,010 changed files with 65,263 additions and 1 deletion.
The diff you're trying to view is too large. We only load the first 3000 changed files.
17 changes: 17 additions & 0 deletions api/NeuralNetworkFakeNews
@@ -0,0 +1,17 @@
import pandas as pd
import pickle

def nn_fakenews_prediction(texts):
    model_file = "api/models/nn_fakenews_model.pkl"
    try:
        # Load the trained pipeline from the .pkl file
        with open(model_file, 'rb') as f:
            pipeline = pickle.load(f)

        # Run predictions on the input texts
        predictions = pipeline.predict(texts)

        return predictions

    except Exception as e:
        return str(e)
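A minimal usage sketch for the helper above (assuming the file is saved with a .py extension so it can be imported, and that the pickled model exists at the hard-coded path; the sample texts are purely illustrative):

# Hypothetical module name; the committed file has no .py extension
from NeuralNetworkFakeNews import nn_fakenews_prediction

sample_texts = [
    "Breaking: miracle cure discovered, doctors hate it!",
    "The central bank kept interest rates unchanged this quarter.",
]

# Returns the pipeline's predictions, or an error string if loading/prediction fails
print(nn_fakenews_prediction(sample_texts))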
143 changes: 143 additions & 0 deletions api/Neural_Network2.py
@@ -0,0 +1,143 @@
import csv
import json
import re
import string
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def create_and_train_model(train_texts, train_labels, name, epochs=5):
    # Build a pandas DataFrame from the training texts and labels
    train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
    # Apply the preprocessing step to every text
    train_df['text'] = train_df['text'].apply(preprocess_text)

    # Create a directory to store each training text as an individual file
    output_directory = 'arquivos_texto_treino_nn'
    os.makedirs(output_directory, exist_ok=True)

    # Write each text to a separate file inside the created directory
    for index, row in train_df.iterrows():
        filename = os.path.join(output_directory, f'texto_{index}.txt')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(row['text'])

    # Training directory for the model
    treino_dir = output_directory

    # Create a text dataset using the TensorFlow datasets API
    train_dataset = tf.keras.utils.text_dataset_from_directory(
        treino_dir,
        batch_size=32,
        shuffle=True,
    )

    # Model parameters
    max_features = 20000
    embedding_dim = 128
    sequence_length = 500

    # Create a text-vectorization layer
    vectorize_layer = TextVectorization(
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # Adapt the vectorization layer to the text dataset
    text_ds = train_dataset.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    # Vectorize the text while keeping the labels
    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Apply the vectorization to the training dataset
    train_ds = train_dataset.map(vectorize_text)
    train_ds = train_ds.cache().prefetch(buffer_size=10)

    # Define the model architecture
    inputs = tf.keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.5)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    # Create and compile the model
    model = tf.keras.Model(inputs, predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Train the model
    history = model.fit(train_ds, epochs=epochs)

    # Output file name
    name = f"nn_{name}.pkl"
    # Save the trained model
    with open(name, "wb") as model_file:
        pickle.dump(model, model_file)

    # Collect training statistics
    training_stats = {
        "loss": history.history['loss'],
        "accuracy": history.history['accuracy']
    }

    # Return the statistics as JSON
    return json.dumps(training_stats)

'''
Using the saved file name we can, for example:
create_and_train_model(texts_train, labels_train, "Teste")   # saves the model to "nn_Teste.pkl"
Then load the trained model back from that file:
with open("nn_Teste.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
Now loaded_model can be used to make predictions, for example:
predictions = loaded_model.predict(new_texts)
'''
'''
TEST:
df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")
df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)
df_fake['text'] = df_fake["text"]
df_true['text'] = df_true["text"]
df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]
textos = df_fake_train['text'].tolist() + df_true_train['text'].tolist()
labels = [0] * len(df_fake_train) + [1] * len(df_true_train)
create_and_train_model(textos, labels, "Teste")
'''
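Because create_and_train_model returns the training history as a JSON string, a caller can turn it back into Python lists; a minimal sketch reusing the variables from the commented-out test above:

stats_json = create_and_train_model(textos, labels, "Teste")
stats = json.loads(stats_json)  # json is already imported at the top of this module
print(stats["loss"])      # one loss value per epoch
print(stats["accuracy"])  # one accuracy value per epoch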
Binary file added api/__pycache__/DataProcesser.cpython-39.pyc
Binary file not shown.
Binary file added api/__pycache__/NbEmotionsModel.cpython-39.pyc
Binary file not shown.
Binary file added api/__pycache__/NbNewsModel.cpython-39.pyc
Binary file not shown.
11 changes: 10 additions & 1 deletion api/app.py
@@ -1,13 +1,14 @@
from flask import Flask, jsonify, request
from flask_cors import CORS
from DataProcesser import DataProcesser
from available_classifiers import get_available_classifiers
from Neural_Network2 import create_and_train_model  # used by the /neural-network route below (assumed to live in api/Neural_Network2.py)

import os
import atexit
import threading
import pandas as pd
import nltk
import json

nltk.download('wordnet')

@@ -54,6 +55,14 @@ def shutdown():
    shutdown_server()
    return 'Server shutting down...'

@app.route('/neural-network', methods=["POST"])
def train_model():
    received_data = request.get_json()
    selected_data = received_data.get('data')
    selected_label = received_data.get('label')
    name = received_data.get('name')
    return create_and_train_model(selected_data, selected_label, name)

if __name__ == '__main__':
    server_thread = threading.Thread(target=run_flask_app)
    server_thread.start()
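A minimal sketch of how a client might exercise the new /neural-network route, assuming the server is reachable on Flask's default port 5000 (the port and the example payload values are assumptions, not part of this diff):

import requests

payload = {
    "data": ["example text one", "example text two"],  # texts to train on (illustrative)
    "label": [0, 1],                                    # matching labels
    "name": "fakenews",                                 # used to build the saved file name nn_fakenews.pkl
}

response = requests.post("http://localhost:5000/neural-network", json=payload)
print(response.text)  # JSON string with per-epoch loss and accuracy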
185 changes: 185 additions & 0 deletions api/models/nn_fakenews.py
@@ -0,0 +1,185 @@
import csv
import re
import string
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")


df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)

def wordopt(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

df_fake['text'] = df_fake["text"].apply(wordopt)
df_true['text'] = df_true["text"].apply(wordopt)

df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]

df_fake_test = df_fake[5000:10000]
df_true_test = df_true[5000:10000]

# Directory to save the text files (fake training set)
output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/fake'

# Make sure the directory exists, creating it if necessary
os.makedirs(output_directory, exist_ok=True)

# Iterate over each row of the DataFrame and save its text to a file
for index, row in df_fake_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/true'

# Make sure the directory exists, creating it if necessary
os.makedirs(output_directory, exist_ok=True)

# Iterate over each row of the DataFrame and save its text to a file
for index, row in df_true_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])


# Directory to save the text files (fake test set)
output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/fake'

# Make sure the directory exists, creating it if necessary
os.makedirs(output_directory, exist_ok=True)

# Iterate over each row of the DataFrame and save its text to a file
for index, row in df_fake_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/true'

# Make sure the directory exists, creating it if necessary
os.makedirs(output_directory, exist_ok=True)

# Iterate over each row of the DataFrame and save its text to a file
for index, row in df_true_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

treino_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_treino_nn")
teste_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_teste_nn")

train_dataset = tf.keras.utils.text_dataset_from_directory(
treino_dir,
batch_size=32,
shuffle=True,
)

test_dataset = tf.keras.utils.text_dataset_from_directory(
teste_dir,
batch_size=32,
shuffle=True,
)

# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
max_tokens=max_features,
output_mode="int",
output_sequence_length=sequence_length,
)

# Now that the vectorize_layer has been created, call `adapt` on a text-only
# dataset to create the vocabulary. You don't have to batch, but for very large
# datasets this means you're not keeping spare copies of the dataset in memory.

# Let's make a text-only dataset (no labels):
text_ds = train_dataset.map(lambda x, y: x)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Vectorize the data.
train_ds = train_dataset.map(vectorize_text)
test_ds = test_dataset.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)
epochs = 5

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model using the train and test datasets.
model.fit(train_ds, epochs=epochs)

# Save the trained model to a .pkl file
with open("LinguifAI/api/models/nn_fakenews_model.pkl", "wb") as model_file:
pickle.dump(model, model_file)
Binary file added api/models/nn_fakenews_model.pkl
Binary file not shown.
