-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlncRNA_predict.py
103 lines (82 loc) · 2.72 KB
/
lncRNA_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# -*- coding: utf-8 -*-
"""
Created on Sun Jul 21 14:53:57 2019
@author: YudongCai
@Email: [email protected]
"""
import os
os.environ['KERAS_BACKEND']='tensorflow'
import tensorflow as tf
import click
import khmer
import numpy as np
import pandas as pd
from itertools import product
from keras.models import load_model
def tf_no_warning():
"""
Make Tensorflow less verbose
"""
try:
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
except ImportError:
pass
def kmerproduce(infile, k, outfile):
kmerlist = []
for kmer in product('ACGT', repeat=k):
kmerlist.append(''.join(kmer))
df = []
seqids = []
for seq in khmer.ReadParser(infile):
seqids.append(seq.name)
df.append([])
counts = khmer.Counttable(k, 1e6, 4)
counts.set_use_bigcount(True)
counts.consume(seq.sequence.upper())
for kmer in kmerlist:
df[-1].append(counts.get(kmer))
df = pd.DataFrame(df, columns=kmerlist, index=seqids)
df.T.reset_index().to_csv(outfile, index=False, sep='\t')
def load_features(outdir, outprefix):
alldata = []
for k in range(1, 6):
print(f'k = {k}')
file = os.path.join(outdir, f'{outprefix}_{k}mer.tsv.gz')
df = pd.read_csv(file, sep='\t').set_index('index')
df = df / df.sum()
data = df.T.values
alldata.append(data)
return np.hstack(alldata)
def load_seqid(infile):
seqids = []
with open(infile) as f:
for line in f:
if line[0] == '>':
seqids.append(line.strip().split()[0][1:])
return seqids
@click.command()
@click.option('--infile', help='输入的fasta文件路径')
@click.option('--modelfile', help='模型路径')
@click.option('--outdir', help='输出文件目录')
@click.option('--outprefix', help='输出文件名前缀')
def main(infile, modelfile, outdir, outprefix):
tf_no_warning()
print('kmer producing...')
for k in range(1, 6):
print(f'k = {k}')
outfile = os.path.join(outdir, f'{outprefix}_{k}mer.tsv.gz')
kmerproduce(infile, k, outfile)
print('loading features...')
data = load_features(outdir, outprefix)
print('loading model')
model = load_model(modelfile)
print('predicting')
labels = model.predict_classes(data).flatten()
seqids = load_seqid(infile)
df = pd.DataFrame({'seqid': seqids, 'label': labels})
outfile = os.path.join(outdir, f'{outprefix}_result.tsv.gz')
df.to_csv(outfile, sep='\t', index=False)
print('Completed!')
if __name__ == '__main__':
main()