Commit
Merge pull request #4 from TailUFPB/nn-fakenews
Nn fakenews
Showing 20,010 changed files with 65,263 additions and 1 deletion.
The diff you're trying to view is too large. We only load the first 3000 changed files.
@@ -0,0 +1,17 @@
import pickle


def nn_fakenews_precition(texts):
    model_file = "api/models/nn_fakenews_model.pkl"
    try:
        # Load the pickled model from the .pkl file
        with open(model_file, 'rb') as f:
            pipeline = pickle.load(f)

        # Run predictions on the input texts
        predictions = pipeline.predict(texts)

        return predictions

    except Exception as e:
        return str(e)
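
A hypothetical call site for this helper (the import path and sample texts are assumptions, not part of this diff). Note that the function returns the exception message as a string on failure, so callers may want to check the return type:

# Hypothetical usage sketch; the module path is an assumption.
from api.nn_prediction import nn_fakenews_precition

sample_texts = [
    "Breaking: miracle cure discovered, doctors hate it!",
    "The central bank raised interest rates by 0.25 points on Tuesday.",
]

result = nn_fakenews_precition(sample_texts)
if isinstance(result, str):
    print(f"Prediction failed: {result}")  # the error-message path
else:
    print(result)  # model predictions for each input text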
@@ -0,0 +1,143 @@
import json
import re
import string
import os
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization


def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def create_and_train_model(train_texts, train_labels, name, epochs=5):
    # Build a pandas DataFrame from the training texts and labels
    train_df = pd.DataFrame({'text': train_texts, 'label': train_labels})
    # Apply the preprocessing to the texts
    train_df['text'] = train_df['text'].apply(preprocess_text)

    # Create a directory to store the training texts as individual files
    output_directory = 'arquivos_texto_treino_nn'
    os.makedirs(output_directory, exist_ok=True)

    # Write each text to its own file inside a per-label subdirectory,
    # so that text_dataset_from_directory can infer the labels from the
    # directory structure
    for index, row in train_df.iterrows():
        label_dir = os.path.join(output_directory, str(row['label']))
        os.makedirs(label_dir, exist_ok=True)
        filename = os.path.join(label_dir, f'texto_{index}.txt')
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(row['text'])
    # Training directory for the model
    treino_dir = output_directory

    # Build a text dataset using the TensorFlow dataset utilities
    train_dataset = tf.keras.utils.text_dataset_from_directory(
        treino_dir,
        batch_size=32,
        shuffle=True,
    )

    # Model parameters
    max_features = 20000
    embedding_dim = 128
    sequence_length = 500

    # Create a text-vectorization layer
    vectorize_layer = TextVectorization(
        max_tokens=max_features,
        output_mode="int",
        output_sequence_length=sequence_length,
    )

    # Adapt the vectorization layer to the text dataset
    text_ds = train_dataset.map(lambda x, y: x)
    vectorize_layer.adapt(text_ds)

    # Vectorize the text while keeping the labels
    def vectorize_text(text, label):
        text = tf.expand_dims(text, -1)
        return vectorize_layer(text), label

    # Apply the vectorization to the training dataset
    train_ds = train_dataset.map(vectorize_text)
    train_ds = train_ds.cache().prefetch(buffer_size=10)

    # Define the model architecture
    inputs = tf.keras.Input(shape=(None,), dtype="int64")
    x = layers.Embedding(max_features, embedding_dim)(inputs)
    x = layers.Dropout(0.5)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

    # Create and compile the model
    model = tf.keras.Model(inputs, predictions)
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Train the model
    history = model.fit(train_ds, epochs=epochs)
    # File name for the saved model
    name = f"nn_{name}.pkl"
    # Save the model
    with open(name, "wb") as model_file:
        pickle.dump(model, model_file)

    # Collect training statistics
    training_stats = {
        "loss": history.history['loss'],
        "accuracy": history.history['accuracy']
    }

    # Return the statistics as JSON
    return json.dumps(training_stats)


'''
With the saved file name (f"nn_{name}.pkl") we can, for example,
load the trained model back from the file:
with open("nn_Teste.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
Now loaded_model can be used to make predictions, e.g.:
predictions = loaded_model.predict(new_texts)
'''
'''
TEST:
df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")

df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)

df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]

textos = df_fake_train['text'].tolist() + df_true_train['text'].tolist()
labels = [0] * len(df_fake_train) + [1] * len(df_true_train)

create_and_train_model(textos, labels, "Teste")
'''
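
One caveat with the save step above: pickle.dump stores only the Keras model, not the adapted TextVectorization layer, so raw strings cannot be passed straight to loaded_model.predict. A minimal load-and-predict sketch under that assumption: the vectorizer is rebuilt with the same settings and must be re-adapted on the same training corpus (placeholders below); the file name comes from the TEST block.

import pickle
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization

# Placeholder stand-ins; use the real training corpus and real inputs.
train_texts = ["placeholder training text"]
new_texts = ["some headline to classify"]

# Rebuild the vectorizer with the same settings used in create_and_train_model;
# it must be adapted on the same corpus to reproduce the training vocabulary.
vectorize_layer = TextVectorization(
    max_tokens=20000, output_mode="int", output_sequence_length=500
)
vectorize_layer.adapt(tf.constant(train_texts))

# "nn_Teste.pkl" is the file name produced by the TEST block above.
with open("nn_Teste.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Sigmoid scores in [0, 1]; threshold at 0.5 for a hard fake/true label.
scores = loaded_model.predict(vectorize_layer(tf.constant(new_texts)))
print((scores > 0.5).astype(int))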
@@ -0,0 +1,185 @@
import re
import string
import os
import pandas as pd
import tensorflow as tf
import pickle
from tensorflow.keras import layers
from tensorflow.keras.layers import TextVectorization

df_true = pd.read_csv("Linguifai/api/training_df/True.csv")
df_fake = pd.read_csv("Linguifai/api/training_df/Fake.csv")

df_fake = df_fake.drop(['title', 'subject', 'date'], axis=1)
df_true = df_true.drop(['title', 'subject', 'date'], axis=1)


def wordopt(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


df_fake['text'] = df_fake["text"].apply(wordopt)
df_true['text'] = df_true["text"].apply(wordopt)

df_fake_train = df_fake[:5000]
df_true_train = df_true[:5000]

df_fake_test = df_fake[5000:10000]
df_true_test = df_true[5000:10000]

# Directory for the fake training text files
output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/fake'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_fake_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_treino_nn/true'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_true_train.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])


# Directory for the fake test text files
output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/fake'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_fake_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

print("Text files created successfully.")

output_directory = 'LinguifAI/api/training_df/arquivos_texto_teste_nn/true'

# Make sure the directory exists, creating it if needed
os.makedirs(output_directory, exist_ok=True)

# Iterate over each DataFrame row and save the text to a file
for index, row in df_true_test.iterrows():
    # Build the file name from the row index
    filename = os.path.join(output_directory, f'texto_{index}.txt')

    # Open the file and write the text
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(row['text'])

treino_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_treino_nn")
teste_dir = os.path.join("LinguifAI/api/training_df/arquivos_texto_teste_nn")

train_dataset = tf.keras.utils.text_dataset_from_directory(
    treino_dir,
    batch_size=32,
    shuffle=True,
)

test_dataset = tf.keras.utils.text_dataset_from_directory(
    teste_dir,
    batch_size=32,
    shuffle=True,
)

# Model constants.
max_features = 20000
embedding_dim = 128
sequence_length = 500

vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode="int",
    output_sequence_length=sequence_length,
)

# Now that the vectorize_layer has been created, call `adapt` on a text-only
# dataset to create the vocabulary. You don't have to batch, but for very large
# datasets this means you're not keeping spare copies of the dataset in memory.

# Let's make a text-only dataset (no labels):
text_ds = train_dataset.map(lambda x, y: x)
# Let's call `adapt`:
vectorize_layer.adapt(text_ds)


def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


# Vectorize the data.
train_ds = train_dataset.map(vectorize_text)
test_ds = test_dataset.map(vectorize_text)

# Do async prefetching / buffering of the data for best performance on GPU.
train_ds = train_ds.cache().prefetch(buffer_size=10)
test_ds = test_ds.cache().prefetch(buffer_size=10)

# An integer input for vocab indices.
inputs = tf.keras.Input(shape=(None,), dtype="int64")

# Next, we add a layer to map those vocab indices into a space of dimensionality
# 'embedding_dim'.
x = layers.Embedding(max_features, embedding_dim)(inputs)
x = layers.Dropout(0.5)(x)

# Conv1D + global max pooling
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x)
x = layers.GlobalMaxPooling1D()(x)

# We add a vanilla hidden layer:
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.5)(x)

# We project onto a single unit output layer, and squash it with a sigmoid:
predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x)

model = tf.keras.Model(inputs, predictions)
epochs = 5

# Compile the model with binary crossentropy loss and an adam optimizer.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Fit the model on the training dataset.
model.fit(train_ds, epochs=epochs)

# Save the trained model to a .pkl file
with open("LinguifAI/api/models/nn_fakenews_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
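
The script prepares test_ds but never evaluates on it. A minimal follow-up sketch, run in the same session so that model and test_ds are still in scope:

# Evaluate the trained model on the held-out test split prepared above;
# test_ds is already vectorized, batched, and prefetched.
loss, accuracy = model.evaluate(test_ds)
print(f"test loss: {loss:.4f}, test accuracy: {accuracy:.4f}")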