-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRandom_transcription.py
143 lines (109 loc) · 5.11 KB
/
Random_transcription.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# This script asks things about what happens if you have random transcription of the genome
import os
import sys
from Bio import SeqIO
import numpy as np
import random
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# add modules
sys.path.append('/Volumes/GoogleDrive/Mi unidad/CareyLab/Miki/Python_modules/')
import CodonBias_QC_Functions_and_classes as CB
from GFF3parser import *
# define the plot to do, from command-line
type_plot = sys.argv[1]
if type_plot not in ['length_ORFs','tAI_in_random','nTE_in_random','tAI_in_random_weight']:
type_plot = 'tAI_in_random_weight'
########## compare tAI and CAI of native vs random genes ###############
orf_coding_path = '/Users/miki_schikora_tamarit/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/Yeast/S288C_reference_genome_R64-2-1_20150113/orf_coding_all_R64-2-1_20150113.fasta'
transcripts_path = '/Users/miki_schikora_tamarit/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/Yeast/S288C_reference_genome_R64-2-1_20150113/orf_trans_all_R64-2-1_20150113.fasta'
#RNAexpression_path = '~/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/vanDijk15/FitFlowRNAseq.tab'
#RNAexpression_path = '~/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/vanDijk15/FitFlow__Fast_Slow.tab'
RNAexpression_path = '~/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/Carey15/Carey15_RNAseq.tab'
genome_path = '/Users/miki_schikora_tamarit/Google Drive File Stream/Mi unidad/CareyLab/ExternalData/Yeast/S288C_reference_genome_R64-2-1_20150113/genome.fasta'
# generate the tAI for random ORFs
# generate a set of orfs comming from random positions in the genome, through all chromosomes
n_orfs = 100 # number of orfs to generate of each chromosome
tAI_random = []
nTE_random = []
for chr in SeqIO.parse(genome_path, "fasta"):
print('generating random ORFs in chr %s'%(chr.id))
chr_seq = chr.seq._data
# generate random orfs n_orfs times:
for n in range(0,n_orfs):
tSS = random.randint(0,len(chr_seq))# simulate a transcription start site. It's an index, so it starts with 0
tAI_random.append(CB.calc_tAI(CB.get_first_orf(chr_seq,tSS)))# generate the random orf
nTE_random.append(CB.calc_tAI(CB.get_first_orf(chr_seq, tSS), type='nTE'))
tAI_random = np.array(tAI_random)
tAI_random = tAI_random[tAI_random>0]
nTE_random = np.array(nTE_random)
nTE_random = nTE_random[nTE_random>0]
# record the distribution of ORF length
length_genome = np.array([len(o.seq._data)/3 for o in SeqIO.parse(orf_coding_path, "fasta")])
# assign an expression weight to each gene
#Expression = pd.read_table(RNAexpression_path).set_index('TranscriptID')['FastFPKM'].to_dict()
#Expression = pd.read_table(RNAexpression_path,names=['ORF','name','position','FastFPKM','SlowFPKM']).set_index('ORF')['FastFPKM'].to_dict()
Expression = pd.read_table(RNAexpression_path).groupby('target_id').sum().to_dict()['tpm']
# calc tAI for native ORFs
tAI_genome = np.array(list(map(CB.calc_tAI, SeqIO.parse(orf_coding_path, "fasta"))))
tAI_genome = tAI_genome[tAI_genome>0]
# calc weighted tAI
tAI_genome_weight = []
n_orfs_added=0
for orf in SeqIO.parse(orf_coding_path, "fasta"):
id = orf.id
try:
expression_w = Expression[id]
tAI_genome_weight += [CB.calc_tAI(orf)]*int(expression_w*1)
n_orfs_added += 1
except KeyError:
pass
tAI_genome_weight = np.array(tAI_genome_weight)
tAI_genome_weight = tAI_genome_weight[tAI_genome_weight>0]
nTE_genome = np.array([CB.calc_tAI(s,type='nTE') for s in SeqIO.parse(orf_coding_path, "fasta")])
nTE_genome = nTE_genome[nTE_genome>0]
# plot genome length
if type_plot == 'length_ORFs':
plt.close()
sns.set(style="white", palette="muted", color_codes=True)
sns.distplot(length_genome, color="r", bins=400, hist=True)
plt.tight_layout()
plt.xlabel('nuber of codons')
plt.ylabel('gene occurence')
plt.show()
if type_plot == 'tAI_in_random':
#plot of tAI
plt.close()
sns.set(style="white", palette="muted", color_codes=True)
sns.distplot(tAI_genome, color="m", label='native genes')
sns.distplot(tAI_random, color="b", label='random ORFs')
plt.tight_layout()
plt.xlabel('tRNA adaptation index (tAI)')
plt.ylabel('frequency density')
plt.legend()
plt.show()
if type_plot == 'tAI_in_random_weight':
#plot of tAI
plt.close()
sns.set(style="white", palette="muted", color_codes=True)
sns.distplot(tAI_genome_weight, color="m", label='native genes')
sns.distplot(tAI_random, color="b", label='random ORFs')
#plt.tight_layout()
plt.xlabel('tRNA adaptation index (tAI)')
plt.ylabel('frequency density')
plt.legend()
plt.show(format='pdf')
plt.savefig("graph.pdf",format='pdf')
if type_plot == 'nTE_in_random':
#plot of nTE
plt.close()
sns.set(style="white", palette="muted", color_codes=True)
sns.distplot(nTE_genome, color="m", label='native genes')
sns.distplot(nTE_random, color="b", label='random ORFs')
plt.tight_layout()
plt.xlabel('nTE')
plt.ylabel('frequency density')
plt.legend()
plt.show()
########################################################################