-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathrandomseq.py
574 lines (517 loc) · 25.2 KB
/
randomseq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
"""!
Random Amino Acid and Nucleotide Sequence Generator
Date created: 8th May 2018
License: GNU General Public License version 3 for academic or
not-for-profit use only
Bactome package is free software: you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import random
import secrets
import subprocess
import sys
try:
import fire
except ImportError:
subprocess.check_call([sys.executable, "-m", "pip",
"install", "fire"])
import fire
class RandomSequence(object):
"""!
Class to generate random sequences based on input parameters.
Although the initial intention is to generate DNA/RNA sequences,
it can be used to generate other random sequences; such as,
amino acid sequences or even non-biological sequences.
Basic example:
>>> s = RandomSequence()
>>> selection = {"A":25, "U":25, "G":25, "C":25}
>>> s.initiateNucleotide(selection)
>>> s.generateSequence(50, True, True)
"AGAUCCCCCGGACAGGUUAUGGAUAAGUGUCCACCUCUAACUAUUUCUGC"
Example without start and stop:
>>> s = RandomSequence()
>>> selection = {"A":25, "T":25, "G":25, "C":25}
>>> s.initiateNucleotide(selection)
>>> s.initiateStart("TTG,CTG,ATG")
>>> s.initiateStop("TAA,TAG,TGA")
>>> s.generateSequence(50, False, False)
"AATCCGGACAAATCAAGCACAGGCAGTTTATTCAAGGGGTACCCCAGAAC"
"""
def __init__(self):
"""!
Constructor method.
"""
self.start_codons = []
self.stop_codons = []
self.sseq = []
def initiateNucleotide(self, ndict):
"""!
Initialize the object with a bag of items (atomic sequence),
defined as a dictionary, for random sequence generation. The
atomic sequences will be randomized.
@param ndict Dictionary: Represents a bag of items where the
key is the atomic sequence and the value is the relative
occurrence. For example, {"A":20, "T":20, "G":30, "C":30}
will initialize for the generation of a random sequence with
60% GC.
"""
for k in ndict:
self.sseq = self.sseq + [k] * int(ndict[k])
random.shuffle(self.sseq)
for i in range(100):
random.shuffle(self.sseq)
def initiateStart(self, start_codons="TTG,CTG,ATG"):
"""!
Initialize a list of start codons. This is used for two
purposes. Firstly, it can be used to check the randomly
generated sequence to ensure that there is no start codons
within the sequence. Secondly, it can be used to cap the
start of a newly generated sequence.
@param start_codons String: A comma-delimited list to
represent start codons. Default = "TTG,CTG,ATG".
"""
if type(start_codons) is str:
start_codons = start_codons.strip()
start_codons = [x.strip()
for x in start_codons.split(",")]
elif type(start_codons) is tuple or \
type(start_codons) is list:
start_codons = [x.strip() for x in start_codons]
self.start_codons = start_codons
def initiateStop(self, stop_codons="TAA,TAG,TGA"):
"""!
Initialize a list of stop codons. This is used for two
purposes. Firstly, it can be used to check the randomly
generated sequence to ensure that there is no stop codons
within the sequence. Secondly, it can be used to cap the
end of a newly generated sequence.
@param stop_codons String: A comma-delimited list to
represent start codons. Default = "TAA,TAG,TGA".
"""
if type(stop_codons) is str:
stop_codons = stop_codons.strip()
stop_codons = [x.strip() for x in stop_codons.split(",")]
elif type(stop_codons) is tuple or type(stop_codons) is list:
stop_codons = [x.strip() for x in stop_codons]
self.stop_codons = stop_codons
def _cleanStop(self, seq):
"""!
Private method - to remove all occurrences of stop codons
in a sequence.
@param seq String: Sequence to clean of all stop codons.
@return Cleaned sequence.
"""
for stop in self.stop_codons:
seq = seq.replace(stop, "")
return seq
def _cleanStart(self, seq):
"""!
Private method - to remove all occurrences of start codons
in a sequence.
@param seq String: Sequence to clean of all start codons.
@return Cleaned sequence.
"""
for start in self.start_codons:
seq = seq.replace(start, "")
return seq
def generateSequence(self, length, allow_start=False,
allow_stop=False):
"""!
Generate a random sequence.
@param length Integer: Length of random sequence to generate.
@param allow_start Boolean: Flag to allow occurrence(s) of
start codon(s) within the randomly generated sequence. Default
= False.
@param allow_stop Boolean: Flag to allow occurrence(s) of
stop codon(s) within the randomly generated sequence. Default
= False.
@return Randomly generated sequence of specified length.
"""
length = int(length)
sequence = ""
while len(sequence) < length:
sequence = sequence + secrets.choice(self.sseq)
if allow_start == "false" or not allow_start:
sequence = self._cleanStart(sequence)
if allow_stop == "false" or not allow_stop:
sequence = self._cleanStop(sequence)
return sequence[:length]
def selectionGenerator(selection):
"""!
Function to convert a string-based atomic sequence(s) definition
into a dictionary, suitable as parameter for
RandomSequence.initiateNucleotide() function.
@param selection String: Definition of the atomic sequence(s),
defined as a semi-colon delimited definition of comma-delimited
definition, used for random sequence generation.
"""
selection = str(selection)
selection = [pair.strip() for pair in selection.split(";")]
selection = [[pair.split(",")[0], pair.split(",")[1]]
for pair in selection]
selection = [[pair[0].strip(), pair[1].strip()]
for pair in selection]
ndict = {}
for k in selection:
ndict[str(k[0])] = int(k[1])
return ndict
def _initial_RandomSequence(start_codons, stop_codons,
selection, source_seq):
"""!
Private method - Instantiate a RandomSequence object based on
parameters. Main requirement is either selection or source_seq.
If both selection and source_seq are given (not empty) strings,
only source_seq will be used.
@param start_codons String: A comma-delimited list to represent
start codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no start codons within the sequence. Secondly, it can be used
to cap the start of a newly generated sequence.
@param stop_codons String: A comma-delimited list to represent
stop codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no stop codons within the sequence. Secondly, it can be used
to cap the stop of a newly generated sequence.Default =
"TAA,TAG,TGA".
@param selection String: Definition of the atomic sequence(s),
defined as a semi-colon delimited definition of comma-delimited
definition, used for random sequence generation.
@param source_seq String: A sequence to be used to generate the
atomic sequence(s), used for random sequence generation. The
atomic sequence(s) is/are essentially an single-character
alphanumeric of the sequence, where relative abundance of each
character is determined by the occurrence in the sequence.
@return RandomSequence object.
"""
o = RandomSequence()
o.initiateStart(start_codons)
o.initiateStop(stop_codons)
if source_seq == "":
ndict = selectionGenerator(selection)
else:
source_seq = [x for x in source_seq]
ndict = {}
for k in list(set(source_seq)):
ndict[k] = source_seq.count(k)
o.initiateNucleotide(ndict)
return o
def _generate_sequence(o, min_length, max_length,
allow_start, allow_stop,
cap_start, cap_stop):
"""!
Private method - Generates a fixed or variable length random
sequence based on given RandomSequence object. For fixed length
random sequence generation, min_length == max_length. If
min_length != max_length, variable length random sequence will
be generated with length uniformly distributed between min_length
and max_length.
@param o Object: RandomSequence object.
@param min_length Integer: Minimum length of randomly generated
sequence.
@param max_length Integer: Maximum length of randomly generated
sequence.
@param allow_start Boolean: Flag to allow occurrence(s) of
start codon(s) within the randomly generated sequence.
@param allow_stop Boolean: Flag to allow occurrence(s) of
stop codon(s) within the randomly generated sequence.
@param cap_start Boolean: Flag to determine whether to cap the
start of the randomly generated sequence with a start codon.
@param cap_stop Boolean: Flag to determine whether to cap the
end of the randomly generated sequence with a stop codon.
@return Generated random sequence.
"""
min_length = int(min_length)
max_length = int(max_length)
if min_length == max_length:
sequence = o.generateSequence(max_length, allow_start,
allow_stop)
else:
sequence = o.generateSequence(max_length, allow_start,
allow_stop)
length = min_length + \
int((max_length - min_length) * random.random())
sequence = sequence[:length+1]
if str(cap_start).lower() == "true" or cap_start == True:
sequence = secrets.choice(o.start_codons) + sequence
if str(cap_stop).lower() == "true" or cap_stop == True:
sequence = sequence + secrets.choice(o.stop_codons)
return sequence
def gFixedLength(length, n, allow_start=False, allow_stop=False,
start_codons="TTG,CTG,ATG", cap_start=False,
stop_codons="TAA,TAG,TGA", cap_stop=False,
selection="A,250;T,250;G,250;C,250",
source_seq="", fasta=True, prefix="Test"):
"""!
Function to generate one or more fixed length random sequences as
per specification.
Usage:
python randomseq.py FLS --length=100 --n=10 --allow_start=False --allow_stop=False --start_codons="TTG,CTG,ATG" --stop_codons="TAA,TAG,TGA" --cap_start=True --cap_stop=True --selection="A,250;T,250;G,250;C,250" --fasta=True --prefix="Test"
@param length Integer: Length of random sequence to generate.
@param n Integer: Number of random sequence(s) to generate.
@param allow_start Boolean: Flag to allow occurrence(s) of start
codon(s) within the randomly generated sequence. Default = False.
@param allow_stop Boolean: Flag to allow occurrence(s) of stop
codon(s) within the randomly generated sequence. Default = False.
@param start_codons String: A comma-delimited list to represent
start codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no start codons within the sequence. Secondly, it can be used
to cap the start of a newly generated sequence.Default =
"TTG,CTG,ATG".
@param cap_start Boolean: Flag to determine whether to cap the
start of the randomly generated sequence with a start codon.
Default = False.
@param stop_codons String: A comma-delimited list to represent
stop codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no stop codons within the sequence. Secondly, it can be used
to cap the stop of a newly generated sequence.Default =
"TAA,TAG,TGA".
@param cap_stop Boolean: Flag to determine whether to cap the
end of the randomly generated sequence with a stop codon.
Default = False.
@param selection String: Definition of the atomic sequence(s),
defined as a semi-colon delimited definition of comma-delimited
definition, used for random sequence generation. Default =
"A,250;T,250;G,250;C,250" which means that the list of atomic
sequences is defined as 250 "A", 250 "T", 250 "G", and 250 "C".
@param source_seq String: A sequence to be used to generate the
atomic sequence(s), used for random sequence generation. The
atomic sequence(s) is/are essentially an single-character
alphanumeric of the sequence, where relative abundance of each
character is determined by the occurrence in the sequence.
@param fasta Boolean: Flag to determine whether the output is to
be formatted as a FASTA file. Default = True.
@param prefix String: Prefix to the running number for the
sequence name, if the output is to be a FASTA format. Default =
"Test".
"""
o = _initial_RandomSequence(start_codons, stop_codons,
selection, source_seq)
for i in range(int(n)):
sequence = _generate_sequence(o, length, length, allow_start,
allow_stop, cap_start, cap_stop)
if fasta:
title = "_".join([str(prefix), str(i+1)])
print(">%s" % title)
print(sequence)
def gVariableLength(min_length, max_length, n,
allow_start=False, allow_stop=False,
start_codons="TTG,CTG,ATG", cap_start=False,
stop_codons="TAA,TAG,TGA", cap_stop=False,
selection="A,250;T,250;G,250;C,250",
source_seq="", fasta=True, prefix="Test"):
"""!
Function to generate one or more variable length random sequences
as per specification.
Usage:
python randomseq.py VLS --min_length=90 --max_length=110 --n=10 --allow_start=False --allow_stop=False --start_codons="TTG,CTG,ATG" --stop_codons="TAA,TAG,TGA" --cap_start=True --cap_stop=True --selection="A,250;T,250;G,250;C,250" --fasta=True --prefix="Test"
@param min_length Integer: Minimum length of randomly generated
sequence.
@param max_length Integer: Maximum length of randomly generated
sequence.
@param n Integer: Number of random sequence(s) to generate.
@param allow_start Boolean: Flag to allow occurrence(s) of start
codon(s) within the randomly generated sequence. Default = False.
@param allow_stop Boolean: Flag to allow occurrence(s) of stop
codon(s) within the randomly generated sequence. Default = False.
@param start_codons String: A comma-delimited list to represent
start codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no start codons within the sequence. Secondly, it can be used
to cap the start of a newly generated sequence.Default =
"TTG,CTG,ATG".
@param cap_start Boolean: Flag to determine whether to cap the
start of the randomly generated sequence with a start codon.
Default = False.
@param stop_codons String: A comma-delimited list to represent
stop codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no stop codons within the sequence. Secondly, it can be used
to cap the stop of a newly generated sequence.Default =
"TAA,TAG,TGA".
@param cap_stop Boolean: Flag to determine whether to cap the
end of the randomly generated sequence with a stop codon.
Default = False.
@param selection String: Definition of the atomic sequence(s),
defined as a semi-colon delimited definition of comma-delimited
definition, used for random sequence generation. Default =
"A,250;T,250;G,250;C,250" which means that the list of atomic
sequences is defined as 250 "A", 250 "T", 250 "G", and 250 "C".
@param source_seq String: A sequence to be used to generate the
atomic sequence(s), used for random sequence generation. The
atomic sequence(s) is/are essentially an single-character
alphanumeric of the sequence, where relative abundance of each
character is determined by the occurrence in the sequence.
@param fasta Boolean: Flag to determine whether the output is to
be formatted as a FASTA file. Default = True.
@param prefix String: Prefix to the running number for the
sequence name, if the output is to be a FASTA format. Default =
"Test".
"""
o = _initial_RandomSequence(start_codons, stop_codons,
selection, source_seq)
for i in range(int(n)):
sequence = _generate_sequence(o, min_length, max_length,
allow_start, allow_stop,
cap_start, cap_stop)
if fasta:
title = "_".join([str(prefix), str(i+1)])
print(">%s" % title)
print(sequence)
def shuffle(sequence):
"""!
Function to shuffle a given sequence.
Usage:
python randomseq.py shuffle --sequence="GTCCGTAGTCACTAGCTGACTAGTCGATCGATCGATGCTACGATGCATCAGTGCTAGCTGATGCTACGATCGATCGATGCTAGCA"
@param sequence String: Sequence to shuffle
"""
sequence = list(sequence)
random.shuffle(sequence)
return "".join(sequence)
def _process_mixed_dictionary(statement):
"""
Private method - Process the sequence specification statement
from gMixedSequences() function into a dictionary.
"""
statement = [s.strip() for s in statement.split(";")]
d = {}
for i in range(len(statement)):
d[i] = {"statement": statement[i]}
for k in d:
if d[k]["statement"].startswith("c("):
d[k]["type"] = "constant"
d[k]["sequence"] = d[k]["statement"][2:-1]
elif d[k]["statement"].startswith("v("):
d[k]["type"] = "variable"
options = d[k]["statement"][2:-1]
options = [x.strip() for x in options.split(",")]
options = [x.lower() for x in options]
d[k]["options"] = options
d[k]["sequence"] = None
elif d[k]["statement"].startswith("o("):
options = d[k]["statement"][2:-1]
if options.lower() == "start":
d[k]["type"] = "startcodon"
elif options.lower() == "stop":
d[k]["type"] = "stopcodon"
return d
def _generate_MDseq(o, md):
"""
Private method - Used by gMixedSequences() function to generate
the required random sequences.
"""
for k in md:
if md[k]["type"] == "variable":
min_length = md[k]["options"][0]
max_length = md[k]["options"][1]
allow_start = md[k]["options"][2]
allow_stop = md[k]["options"][3]
cap_start = md[k]["options"][4]
cap_stop = md[k]["options"][5]
s = _generate_sequence(o, min_length, max_length,
allow_start, allow_stop,
cap_start, cap_stop)
md[k]["sequence"] = s
elif md[k]["type"] == "startcodon":
md[k]["sequence"] = secrets.choice(o.start_codons)
elif md[k]["type"] == "stopcodon":
md[k]["sequence"] = secrets.choice(o.stop_codons)
keys = list(md.keys())
keys.sort()
sequence = [md[k]["sequence"] for k in keys]
return "".join(sequence)
def gMixedSequences(n, selection="A,250;T,250;G,250;C,250",
start_codons="TTG,CTG,ATG", cap_start=False,
stop_codons="TAA,TAG,TGA", cap_stop=False,
source_seq="", fasta=True, prefix="Test",
statement="v(10,15,False,False,False,False);c(gtccg);v(10,15,False,False,False,False)"):
"""!
Function to generate a mixed sequence, which can take many forms.
For example, this function can be used to generate a random
sequence of 30-40 nucleotides, followed by a 10-nucleotide
constant region, followed by a random region of exactly 20
nucleotides.
Usage:
python randomseq.py MS --n=10 --start_codons="TTG,CTG,ATG" --stop_codons="TAA,TAG,TGA" --cap_start=True --cap_stop=True --selection="A,250;T,250;G,250;C,250" --fasta=True --prefix="Test" --statement="v(10,15,False,False,False,False);o(start);c(gtccg);v(20,30,False,False,False,False);o(stop)""
The statement is a specification describing the random sequence to
generate. Each part of the sequence is specified in the format of
v(<min length>, <max length>, <allow start>, <allow stop>, <cap
start>, <cap stop>) or c(<constant sequence>). A random sequence
of 25-35 nucleotides, not allowing either start or stop codons
with the sequence but required capping the 5"-end of the sequence
with start codon is defined as v(25,35,False,False,True,False). A
constant region, 5"-GAATTC-3", is defined as c(GAATTC). Start
codon is defined as o(start) and stop codon is defined as o(stop).
Each part is then concatenated with semi-colon. Hence, the
definition, v(10,15,False,False,False,False);o(start);
c(gtccg);v(10,15,False,False,False,False);o(stop), will generate a
sequence of (1) a random region of 10 nucleotides, followed by (2)
another randomly selected start codon, followed by (3) a constant
region of gtccg, followed by (4) a random 20-30 nucleotide
sequence, followed by (5) another randomly selected stop codon.
The resulting sequence will be 10 to 15 (first variable region) +
3 (start codon) + 5 (constant region) + 20 to 30 (second variable
region) + 3 (stop codon) = 41 to 56 nucleotides long.
@param n Integer: Number of random sequence(s) to generate.
@param selection String: Definition of the atomic sequence(s),
defined as a semi-colon delimited definition of comma-delimited
definition, used for random sequence generation. Default =
"A,250;T,250;G,250;C,250" which means that the list of atomic
sequences is defined as 250 "A", 250 "T", 250 "G", and 250 "C".
@param start_codons String: A comma-delimited list to represent
start codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no start codons within the sequence. Secondly, it can be used
to cap the start of a newly generated sequence.Default =
"TTG,CTG,ATG".
@param cap_start Boolean: Flag to determine whether to cap the
start of the randomly generated sequence with a start codon.
Default = False.
@param stop_codons String: A comma-delimited list to represent
stop codons. This is used for two purposes. Firstly, it can be
used to check the randomly generated sequence to ensure that there
is no stop codons within the sequence. Secondly, it can be used
to cap the stop of a newly generated sequence.Default =
"TAA,TAG,TGA".
@param cap_stop Boolean: Flag to determine whether to cap the
end of the randomly generated sequence with a stop codon.
Default = False.
@param source_seq String: A sequence to be used to generate the
atomic sequence(s), used for random sequence generation. The
atomic sequence(s) is/are essentially an single-character
alphanumeric of the sequence, where relative abundance of each
character is determined by the occurrence in the sequence.
@param fasta Boolean: Flag to determine whether the output is to
be formatted as a FASTA file. Default = True.
@param prefix String: Prefix to the running number for the
sequence name, if the output is to be a FASTA format. Default =
"Test".
@param statement String: Specification statement of the random
sequence to generate. Please see above description. Default =
"v(10,15,False,False,False,False);c(gtccg);
v(10,15,False,False,False,False)"
"""
o = _initial_RandomSequence(start_codons, stop_codons,
selection, source_seq)
md = _process_mixed_dictionary(statement)
for i in range(int(n)):
sequence = _generate_MDseq(o, md)
if fasta:
title = "_".join([str(prefix), str(i+1)])
print(">%s" % title)
print(sequence)
if __name__ == "__main__":
exposed_functions = {"FLS": gFixedLength,
"VLS": gVariableLength,
"MS": gMixedSequences,
"shuffle": shuffle}
fire.Fire(exposed_functions)