Skip to content

Commit

Permalink
changes in data reading
Browse files Browse the repository at this point in the history
  • Loading branch information
nilbsongalindo committed Dec 17, 2020
1 parent e37c225 commit d5d40af
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 59 deletions.
Binary file not shown.
14 changes: 10 additions & 4 deletions volumes/rnamining-front/assets/scripts/counters/arff_creator.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,26 @@
from sys import argv
import sys
import os
from Bio import SeqIO
from Bio import Seq
#call script: python arff_creator.py sequences.fa

def Verification(input_verification, output_verification=None):
    """Validate a FASTA file and optionally rewrite its records to a handle.

    The file is accepted when its first line starts with '>' and its last
    line does not. Any other layout, including an empty file, terminates
    the program through ``sys.exit`` with an explanatory message.

    Args:
        input_verification: Path of the FASTA file to check.
        output_verification: Optional writable text handle. When given,
            every record parsed by ``SeqIO.parse`` is written to it as a
            ``>description`` line followed by the sequence on one line.
            When omitted, the file is only validated.

    Raises:
        SystemExit: If the file is empty, does not start with a header
            line, or ends with a header line.
    """
    # Read through a context manager so the handle is always released.
    with open(input_verification, "r") as input_file:
        lines = input_file.readlines()

    # `not lines` guards the empty-file case, which would otherwise fail
    # with an IndexError on lines[0] instead of the intended message.
    if not lines or not lines[0].startswith('>') or lines[-1].startswith('>'):
        sys.exit("Error: The inserted file does not match with the default of fasta file! Check the lines, the header and sequence lines do not match!")

    if output_verification is None:
        return

    for record in SeqIO.parse(input_verification, "fasta"):
        # str() makes the written text explicit instead of relying on
        # writelines() iterating over a Seq object character by character.
        output_verification.write(">" + record.description + '\n' + str(record.seq) + '\n')



def header(dl_input):
Expand Down
80 changes: 25 additions & 55 deletions volumes/rnamining-front/assets/scripts/rnamining.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,34 @@
#from keras.models import load_model
from scipy.io import arff
import numpy as np
import pandas as pd
from sys import argv
#import tensorflow as tf
#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from counters import arff_creator
import argparse
import os
import pickle
import model_train

# Set at import time for this process and any child processes.
# NOTE(review): presumably meant to silence TensorFlow's native logging; the
# tensorflow/keras imports above are commented out, so confirm it is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

def process_inputfile(filename, organism_name):
def process_inputfile(filename, organism_name,output_folder):
"""
Description: function to process the input file. It generates the header and counts its nucleotides frequency. Then,
it reads the file and uses as input to the network model.
Arguments: filename - the name of input file
organism_name - the name of the organism (e.g.. escherichia_coli, arabidopsis_thaliana)
"""
organism_name - the name of the organism (e.g.. escherichia_coli, arabidopsis_thaliana)
"""
out = organism_name+'.arff'
output_file = open(out, 'w')
arff_creator.Verification(filename)
output_verification = open(output_folder + "/edited_file.fasta", "w")
arff_creator.Verification(filename, output_verification)
output_verification.close()
filename = output_folder + "/edited_file.fasta"
arff_creator.header(output_file)
arff_creator.trinucleotides_counts(filename,output_file)
output_file.close()
data = arff.loadarff(out)
data = pd.DataFrame(data[0])

data = pd.DataFrame(data[0])


#Normalização do tamanho da sequencia
ar = np.array(data)
X = ar.astype(int)
Expand Down Expand Up @@ -57,18 +56,18 @@ def process_outputfile(filename_path, predict, organism_name, prediction_type, o

#The last instance
if(i==(len(predict)-1)):
if predict[i]==1:
out[i] = ids[i] + '\tcoding'
if predict[i]==0:
out[i] = ids[i] + '\tnon-coding'

else:
out[i] = ids[i] + '\tnon-coding'
out[i] = ids[i] + '\tcoding'
else:
#All instances
if predict[i]==1:
out[i] = ids[i] + '\tcoding\n'
if predict[i]==0:
out[i] = ids[i] + '\tnon-coding\n'

else:
out[i] = ids[i] + '\tnon-coding\n'
out[i] = ids[i] + '\tcoding\n'

output_file = open(output_folder+'/predictions.txt', 'w')
output_file.writelines("RNAMining Predictions\n")
Expand All @@ -94,61 +93,32 @@ def process_outputfile(filename_path, predict, organism_name, prediction_type, o
#np.savetxt('predictions.txt',out,delimiter = ",", fmt="%s")
def predict(filename_path, organism_name, prediction_type, output_folder):
    """
    Description: function to predict the sequences of a file with the pickled model of an organism. The input file is
                 first processed by process_inputfile, then the model stored at
                 models/coding_prediction/<organism_name>.pkl is loaded and its predictions are passed to
                 process_outputfile, which generates the output file.
    Arguments:   filename_path - the filename path that contains the sequences
                 organism_name - the name of the organism; it selects the model file (e.g. homo_sapiens, mus_musculus)
                 prediction_type - the sequence type; it is only forwarded to process_outputfile, the model
                                   directory is always coding_prediction
                 output_folder - the folder handed to process_inputfile and process_outputfile
    """
    # NOTE(review): organism_name comes from the command line and is placed in the path unchanged.
    model_path = 'models/' + 'coding_prediction/' + organism_name + '.pkl'

    try:
        X = process_inputfile(filename_path, organism_name, output_folder)
        # NOTE: pickle.load can execute arbitrary code; only load model files from a trusted location.
        # The context manager closes the model file, which the bare open() call never did.
        with open(model_path, 'rb') as model_file:
            model = pickle.load(model_file)
        # Named `predictions` so the local does not shadow this function's own name.
        predictions = model.predict(X)
        process_outputfile(filename_path, predictions, organism_name, prediction_type, output_folder)

    # NOTE(review): a missing model file raises FileNotFoundError, not NameError, so this message is
    # unlikely to appear for an unknown organism_name - confirm the intended exception type.
    except NameError:
        print('Please check if organism_name and prediction_type matches RNAMining documentation.')

def train(filename_path_cod, filename_path_ncod, output_filename):
    """
    Description: function to train a model for RNA coding prediction from one file of coding sequences and one file of
                 non-coding sequences. Each file is processed through model_train into a dataset, both datasets are
                 balanced, and the result is split into inputs (all columns but the last) and targets (last column)
                 before being handed to model_train.xgboost_model.
    Arguments:   filename_path_cod - the path of the file with the coding sequences
                 filename_path_ncod - the path of the file with the non-coding sequences
                 output_filename - passed to model_train.xgboost_model (presumably where the model is saved)
    """
    def _build_dataset(sequences_path, tag, is_coding):
        # The intermediate '<tag>.arff' file is removed as soon as the dataset has been read from it.
        arff_path = tag + '.arff'
        model_train.process_inputfile(sequences_path, tag)
        frame = model_train.process_dataset(arff_path, is_coding)
        os.remove(arff_path)
        return frame

    coding_set = _build_dataset(filename_path_cod, 'cod', True)
    noncoding_set = _build_dataset(filename_path_ncod, 'ncod', False)
    balanced = model_train.balance(coding_set, noncoding_set)

    features = balanced.iloc[:, :-1].values
    targets = balanced.iloc[:, -1:].values.ravel()
    model_train.xgboost_model(features, targets, output_filename)

def main():
    """
    Description: command line entry point. It parses the arguments and runs predict with them.
    Arguments (command line, all required):
                 -f / --filename - the file with the sequences to predict
                 -organism_name / --organism_name - the organism whose model is used
                 -prediction_type / --prediction_type - the type of the sequence prediction
                 -output_folder / --output_folder - the folder that receives the output
    """
    parser = argparse.ArgumentParser(description='RNAmining: a deep learning stand-alone and web server tool for sequences coding prediction and RNA functional assignation')
    # Each option is registered exactly once: argparse raises an error when the same option string is added twice.
    parser.add_argument('-f','--filename', help='The filename with a sequence to predict', required=True)
    parser.add_argument('-organism_name','--organism_name', help='The name of the organism you want to predict/train. Currently, the following organism names are suported in this tool: escherichia_coli, arabidopsis_thaliana, drosophila_melanogaster, homo_sapiens, mus_musculus, saccharomyces_cerevisiae', required=True)
    parser.add_argument('-prediction_type','--prediction_type', help='The type of the sequence (coding_prediction, ncRNA_functional_assignation)', required=True)
    parser.add_argument('-output_folder', '--output_folder', help='The output folder',required= True)
    args = vars(parser.parse_args())

    # Prediction is the only mode; it runs once, with no dependence on a string-valued boolean flag.
    predict(args['filename'], args['organism_name'], args['prediction_type'], args['output_folder'])


# Script entry point. The call is unconditional (there is no __name__ == "__main__" guard),
# so importing this module also parses the command line and runs main().
main()

0 comments on commit d5d40af

Please sign in to comment.