-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcytomod.py
executable file
·861 lines (780 loc) · 44 KB
/
cytomod.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
#!/usr/bin/env python
from __future__ import with_statement, division, print_function
"""Cytomod uses information on modification
locations to replace the appropriate symbols in a reference genome sequence
with the new symbols. Cytomod can incorporate data from both
single-base assays (point annotations in BED format)
and lower-resolution assays (region annotations in BED or WIG formats).
The output from this program is intended to allow for de novo
discovery of modification-sensitive motifs in known TFBSs.
This program currently handles the following modifications
to the cytosine nucleobase: {5mC, 5hmC, 5fC, 5caC}.
"""
import argparse
import glob
import gzip
import math
import os
import random
import re
import warnings
from collections import OrderedDict
import numpy as np
import cytoUtils as cUtils
from cytoUtils import duplicates, indices
__version__ = cUtils.__version__
_AUTOSOME_ONLY_FLAG = 'u'
_ALLOSOME_ONLY_FLAG = 'l'
_MITOCHONDRIAL_ONLY_FLAG = 'm'
_MITOCHONDRIAL_INCLUSION_FLAG = _MITOCHONDRIAL_ONLY_FLAG.upper()
SUPPORTED_FORMATS_REGEX = '\.(bed|wig|bed[gG]raph)(.gz)?$'
CHROMOSOME_TYPE_REGEXES = {_AUTOSOME_ONLY_FLAG: 'chr\d+',
_ALLOSOME_ONLY_FLAG: 'chr[XY]',
_MITOCHONDRIAL_ONLY_FLAG: 'chrM',
_MITOCHONDRIAL_INCLUSION_FLAG: 'chr(?:\d+|[XYM])'}
# default chromosomal exclusions include: unmapped data, haplotypes, and chrM
CHROMOSOME_EXCLUSION_REGEX = '(?:random|hap|chrM)'
MOD_BASE_REGEX = '5(m|hm|f|ca)C'
REGION_REGEX = '(chr(?:\d+|[XYM]))(?::(?P<start>\d+)?-(?P<end>\d+)?)?'
_DEFAULT_ARCHIVE_NAME = 'archive'
_DEFAULT_FASTA_FILENAME = 'modGenome.fa'
# bases ordered by least frequent in our dataset with most ambiguous bases last
_DEFAULT_BASE_PRIORITY = 'fhmc' + ''.join(cUtils.AMBIG_MOD_BASES.keys()[::-1])
_DEFAULT_CENTRED_REGION_LENGTH = 500
_DEFAULT_BASE_PRIORITY_COMMENT = """the resolution of the biological protocol
(i.e. single-base > any chemical > any DIP)"""
_DEFAULT_RAN_LENGTH = 2000
_DEFAULT_MASK_VALUE = 0
_MAX_REGION_LEN = 2000000
_MAX_CONTIG_ATTEMPTS = 3
# tracks to be displayed densely for output UCSC browser tracks
_DENSE_TRACKS = 'ruler ensGene pubs cpgIslandExt oreganno rmsk snp128'
_MASK_TNAME = 'MASK'
def die(msg):
cUtils.die(msg, os.path.basename(__file__))
def warn(msg):
cUtils.warn(msg, os.path.basename(__file__))
def v_print_timestamp(verbosity, msg="", threshold=1):
cUtils.v_print_timestamp(verbosity, msg, threshold,
os.path.basename(__file__))
def _modifychrmExclusionRegex(additionalchrmExclusionFlag):
"""Modify the chromosome exclusion regex,
accroding to the provided flags."""
global CHROMOSOME_EXCLUSION_REGEX # allow modification of global var
# Modify the exclusion regex by adding the regex corresponding
# to the flag that we wish to exclude. However, the dictionary
# containing the regexes identify the group specified by the flag
# (i.e. are inclusion regexes). We therefore invert the additional
# regex via a modified anchored negative lookahead.
CHROMOSOME_EXCLUSION_REGEX += '|(^((?!' + \
CHROMOSOME_TYPE_REGEXES[additionalchrmExclusionFlag] + ').)*$)'
def _ensureRegionValidity(genome, chrm, start, end):
"""Ensures the validity of the given region. Dies if not valid."""
chromosome = genome[chrm]
try:
chromosome = genome[chrm]
except:
die("Invalid region: invalid chrmomosme.")
if (chromosome.start < 0) or (chromosome.start >= end):
die("Invalid region: invalid start position.")
if (end <= start) or (end > chromosome.end):
die("Invalid region: invalid end position.")
def _maybeGetAmbigMapping(base, ambigMap):
"""Maps the given base to its ambiguous base, if possible."""
return (ambigMap.get(base) or
(cUtils.complement(ambigMap.get(cUtils.complement(base)))
if ambigMap.get(cUtils.complement(base)) else None) or base)
def getTrackHeader(modBase):
"""Generates and returns a valid UCSC track header,
with an appropriate name, description, and colour
for the the given modified nucleobase.
The returned header is terminated by a newline.
"""
colour = str(cUtils.getRGB256BaseCol(modBase)).translate(None, '() ')
browserConfLines = "browser hide all\nbrowser dense " + \
_DENSE_TRACKS + "\n"
return browserConfLines + 'track name="Nucleobase ' + modBase + \
'" description="Track denoting ' + cUtils.FULL_MOD_BASE_NAMES[modBase] + \
' covalently modified nucleobases.' + '" color=' + \
re.sub('[() ]', '', colour + "\n")
def getModifiedGenome(genome, modOrder, chrm, start, end,
suppressFASTA, suppressBED, tnames, ambigMap,
maskRegionsFileVal, maskRegionTName,
maskAllUnsetRegions):
"""Returns the modified genome sequence, for the given genome,
over the given input region.
"""
hasModifiedBases = False
chromosome = genome[chrm]
allbasesResult = ""
# Only compute the modified genome in segments.
# This prevents the creation of excessively large NumPy arrays.
for s in range(start, end, _MAX_REGION_LEN):
e = s + _MAX_REGION_LEN if (end - start) >= _MAX_REGION_LEN else end
v_print_timestamp(args.verbose, "Now outputting " + chrm +
" for region: (" + str(s) + ", " + str(e) + ")", 2)
modBaseScores = chromosome[s:e]
if _MASK_TNAME in tnames:
maskTrack = chromosome[s:e, maskRegionTName]
maskIndex = genome.tracknames_continuous.index(maskRegionTName)
modBaseScores[:, maskIndex] = \
np.where(np.logical_or(np.isnan(maskTrack),
maskTrack > maskRegionsFileVal),
np.zeros(maskTrack.shape[0]),
np.ones(maskTrack.shape[0]))
else: # may still need to remove mask track
# TODO this could be made more efficient
for item in genome.tracknames_continuous:
if item.find(_MASK_TNAME) >= 0:
maskIndex = genome.tracknames_continuous.index(item)
modBaseScores = np.delete(modBaseScores, maskIndex, axis=1)
if len(modOrder) != len(set(modOrder)): # intersect, if duplicates
v_print_timestamp(args.verbose, "Intersection is enabled.")
# Convert 0s to nan s.t. below mean works correctly, as an
# intersection. Has no other impact, so fine to leave this way.
modBaseScores[modBaseScores == 0] = np.nan
for base_order, idxs in indices(modOrder,
duplicates(modOrder)).iteritems():
v_print_timestamp(args.verbose, "Intersecting for base '{}'."
.format([base for base in args.priority
if base in modBases][base_order]),
3)
#
# NB: mean is not really needed, could be more efficient w/0|1;
# use of mean is, however, more general, permitting the
# use of other non-binary values, although we do not
# currently make particular use of this.
#
# key is to make any NaNs remain NaN, and leave others finite
modBaseScores[:, idxs] = np.mean(modBaseScores[:, idxs],
axis=1, keepdims=True)
# use '0' for unmodified bases (results in the unmod. base after)
defaultBases = '0'
# if masking all unset regions, use mask base for those, and '0' o/w
if maskAllUnsetRegions:
defaultBases = np.where(np.all(np.isnan(modBaseScores), axis=1),
cUtils.MASK_BASE, '0')
defaultBases = np.tile(defaultBases, (len(modBases), 1)).T
modBasesA = np.where(np.logical_and(np.isfinite(modBaseScores),
modBaseScores != 0), modBases,
# either the modified base or the mask base
# any masking applied here is only for masking
# bases without any data
defaultBases)
modOrderBasedPermutation = np.array(modOrder).argsort()
orderedmodBasesA = modBasesA[:, modOrderBasedPermutation]
referenceSeq = np.array(list(chromosome.seq[s:e].
tostring().upper()), dtype=np.str)
# Filter the bases to take the modified bases in priority order.
x = np.transpose(np.nonzero(orderedmodBasesA != '0'))
u, idx = np.unique(x[:, 0], return_index=True)
def maybeGetModBase(m, r, ambigMap):
"""Returns the modified base corresponding to
the given putatively modified base (m). The base returned
is the input putatively modified base if the corresponding
reference base (r) is modifiable to the input base, or the
complement of that base, if the complemented reference is
modifiable to it, otherwise the reference base (r) itself
is returned. This function also maps modified bases to any
applicable ambiguity codes that are provided in ambigMap.
"""
m = ambigMap.get(m) or m # maybe map to an ambiguous base
if m not in cUtils.MOD_MAP:
return ambigMap.get(r) or r
else:
if cUtils.MOD_MAP[m] == r:
return m
elif cUtils.MOD_MAP[m] == cUtils.complement(r)[0]:
return cUtils.complement(m)[0]
else:
return ambigMap.get(r) or r
# Initially the sequence is unmodified and we successively modify it.
allModBases = np.copy(referenceSeq)
# We vectorize the function for convenience.
# NumPy vectorized functions still execute the Python code at
maybeGetModBase = np.vectorize(maybeGetModBase)
# Mask the sequence, allowing only base modifications
# that modify their 'target' base (i.e. '5fC' = 'f' only modifies 'C').
# Return the reference base for all non-modifiable bases
# and for unmodified bases.
if x.size > 0:
hasModifiedBases = True
# Modify bases
np.put(allModBases, x[idx][:, 0],
maybeGetModBase(orderedmodBasesA[x[idx][:, 0],
x[idx][:, 1]],
allModBases[x[idx][:, 0]], ambigMap))
if ambigMap: # Replace with ambiguous bases in unmodified sequence
unmodBaseIndices = np.delete(np.indices(np.shape(allModBases)),
x[idx][:, 0])
np.put(allModBases, unmodBaseIndices, np.vectorize(lambda base:
_maybeGetAmbigMapping(base, ambigMap))
(np.delete(allModBases, x[idx][:, 0])))
if not suppressBED:
# Create a BED track for each modified base with track data
# Use unique modified bases only, since we may otherwise obtain
# redundant track lines.
for base in set(modBases + cUtils.complement(modBases)):
# NB: This could be done in a more efficient manner.
baseModIdxs = np.flatnonzero(allModBases[x[idx][:, 0]]
== base)
if baseModIdxs.size > 0:
# Get the position of the modified bases in the
# sequence, adding the genome start coordinate of
# the sequence to operate in actual genome coordinates.
modBaseCoords = x[idx][:, 0][baseModIdxs] + s
modBaseStartEnd = np.column_stack((modBaseCoords,
modBaseCoords+1))
# Save the track, appending to a Gzipped BED file.
# TODO save as string buffer (using list joins)
# and gzip after (with cStringIO).
# That will allow actual compression to occur.
with gzip.open(tnames[base],
'ab') as BEDTrack:
np.savetxt(BEDTrack, modBaseStartEnd,
str(chrm) + "\t%d\t%d\t" + base)
if not suppressFASTA:
# Output the unmodified sequence at a verbosity level
# of at least 2, if not too long, otherwise only output
# for a high verbosity level.
v_print_timestamp(args.verbose, """Corresponding unmodified
reference sequence: \n""" +
''.join(referenceSeq), 2
if len(referenceSeq) < 10000 else 6)
# Concatenate the vector together to form the (string) sequence
allbasesResult += ''.join(allModBases)
if (not hasModifiedBases and not suppressBED):
warn(""""There are no modified bases within the requested
region. Accordingly, no BED files have been output
for this region.""")
return allbasesResult
def generateFASTAFile(file, id, genome, modOrder, chrm, start,
end, suppressBED, tnames, ambigMap, maskRegionsFileVal,
maskRegionTName, maskAllUnsetRegions):
"""Writes an optionally Gzipped FASTA file of the modified genome
appending to the given file, using the given ID.
No FASTA ID (i.e. '> ...') is written if no ID is given.
"""
# Write either a Gzipped file or not, by using the appropriate function
with cUtils.maybe_gzip_open(file, 'ab') as modGenomeFile:
if id:
modGenomeFile.write(">" + id + "\n")
modGenomeFile.write(getModifiedGenome(genome, modOrder, chrm,
start, end, False, suppressBED, tnames, ambigMap,
maskRegionsFileVal, maskRegionTName,
maskAllUnsetRegions) + "\n")
def selectRandomRegion(genome, length):
"""Selects a random, non-exluded, chromosome of sufficient length.
This method attempts to ensure that the region selected is
wholly within a supercontig.
"""
selectablechromosomes = {chromosome.name: chromosome.end for
chromosome in genome if
(chromosome.end >= length and
not re.search(CHROMOSOME_EXCLUSION_REGEX,
chromosome.name))}
if not selectablechromosomes:
die("""The region length provided is too long or all
chromosomes have been excluded.""")
chrm = random.choice(selectablechromosomes.keys())
contigAttempts = 0
while True:
start = random.randint(genome[chrm].start, genome[chrm].end)
end = start + length
if genome[chrm].supercontigs[start:end]:
break
elif contigAttempts >= _MAX_CONTIG_ATTEMPTS:
warn("""Attempts to procure sequence from a supercontig
were exhausted. Returning sequence that is not
wholly contained within a supercontig.""")
break
contigAttempts += 1
return chrm, start, end
def parseRegion(genome, region):
"""Parses the provided region, ensuring its validity."""
region = re.sub('[, ]', '', region) # remove unwanted characters
regionMatch = re.search(REGION_REGEX, region)
if not regionMatch:
die("Invalid region: invalid format.")
chrm = regionMatch.group(1)
start = 0
end = 1
if regionMatch.group('start'):
start = int(regionMatch.group('start'))
else:
start = genome[chrm].start
if regionMatch.group('end'):
end = int(regionMatch.group('end'))
else:
end = genome[chrm].end
_ensureRegionValidity(genome, chrm, start, end)
return chrm, start, end
def determineTrackPriority(genome):
"""Currently, an ad hoc and contrived means of determining
which epigenetic modification has precedence. This is done by
naively considering the resolution, and secondarily, the frequency
of a given type of modification.
NB: This method is not yet fully implemented.
"""
# TODO cannot complete this without being able to
# access the intervals over which the track data is defined.
# Genomedata does not appear to support this.
# TODO Ameliorate this.
# print(genome.num_datapoints)
# randomly sample 1000 bases of first chromosome to determine resolution
for chromosome in genome:
testRegion = random.randint(chromosome.start, chromosome.end)
print(chromosome[testRegion:testRegion + 1000])
break
# TODO add a custom action to parse all directories, which ensures
# that each directory contains a trailing slash.
parser = argparse.ArgumentParser()
genomeArchive = parser.add_mutually_exclusive_group(required=True)
genomeArchive.add_argument('-G', '--genomeDataArchiveFullname',
help="The genome data archive. \
It must contain all needed \
sequence and track files. \
If one is not yet created, \
use '-d' instead to create it.")
genomeArchive.add_argument("-d", "--archiveCompDirs", nargs=2,
metavar=('GENOME_DIR', 'TRACKS_DIR'),
help="Two arguments first specifying the directory \
containing the genome and then the directory \
containing all modified base tracks. The genome \
directory must contain (optionally Gzipped) \
FASTA files of chromosomes and/or scaffolds. \
The sequence files must end in either a \
\".fa\" or \".mask\" extension \
(with an optional \".gz\" suffix). \
The track directory must contain \
(optionally Gzipped) genome tracks. \
They must have an extension describing \
their format. We currently support: \
\".wig\", \".bed\", and \".bedGraph\". \
Provided BED files must have exactly \
four columns. The fourth column must be numeric. \
Any rows with a data value of zero will be \
ignored (this does not apply to masking; \
see '-M'), except that such positions will be \
considered as having evidence against that \
particular modification (e.g. will be considered \
as 'set' and will therefore not be masked by \
'--maskAllUnsetRegions'). \
The filename of each track must \
specify what modified nucleobase it pertains to; \
one of: {}. \
Track names can also be of ambiguity codes \
(e.g. \"5xC\") on the positive strand only. \
Such tracks directly specify ambiguous loci. \
If multiple tracks of the same type are \
provided, all such tracks will be added to the \
archive. The output sequence will default to the \
union of all of the same modification type \
(but see '-I'). Alternatively, the track name \
can contain {}, in which case masking can be used \
via '-M' (refer to that option for details). \
Instead of a track directory, a single filename \
that meets the aforementioned requirements may \
be provided if the archive is to contain only \
one track. \
Ensure that all tracks are mapped to the same \
assembly and that this assembly matches the \
genome provided. This will \
create a genome data archive in an \"archive\" \
sub-directory of the provided track directory. \
Use '-G' instead to use an existing \
archive.".format(cUtils.MOD_BASES.keys(),
_MASK_TNAME))
parser.add_argument("--archiveOutDir",
help="Only applicable if '-d' is used. \
The directory in which to save the created \
genome data archive. If not specified, this \
defaults to the directory containing the tracks \
(i.e. the second argument provided to '-d'). \
This defaults to {}.".format(_DEFAULT_ARCHIVE_NAME))
parser.add_argument("--archiveOutName", default=_DEFAULT_ARCHIVE_NAME,
help="Only applicable if '-d' is used. \
The name of the archive (i.e. the name \
of the directory which comprises the genome data \
archive).")
region = parser.add_mutually_exclusive_group()
region.add_argument('-r', '--region', help="Only output the modified genome \
for the given region. This can either be via a file \
or a region specification string. In the latter case, \
the region must be specified in the format: \
chrm<ID>:<start>-<end> (ex. chr1:500-510). \
If a file is being provided, it can be in \
any BEDTools-supported file format \
(BED, VCF, GFF, and Gzipped versions thereof). \
The full path to the file should be provided \
(or just the file name for the current directory).")
parser.add_argument('-c', '--centeredRegion', nargs='?', type=int,
const=_DEFAULT_CENTRED_REGION_LENGTH,
help="If used in conjunction with '-r', \
only output the modified genome \
for the given base pair interval (defaults to 500 bp), \
centered around the given region. \
NB: This region does not necessarily correspond to the \
centre of the peak, since the region's start and end \
coordinates alone are used to find the centre, \
as opposed to any peak information \
(from a narrowPeaks file, for example).")
region.add_argument('-R', '--randomRegion', nargs='?',
const=_DEFAULT_RAN_LENGTH, type=int,
help="Output the modified genome for a random region. \
The chrmomsome will be randomly selected and its \
coordinate space will be randomly and uniformly sampled. \
A length for the random region can either be specified \
or it will otherwise be set to a reasonably \
small default. The length chosen may constrain the \
selection of a chromosome.")
parser.add_argument('-A', '--alterIncludedChromosomes',
choices=CHROMOSOME_TYPE_REGEXES.keys(),
help="Include or exclude chromosome \
types. '{}': Use only autosomal chromosomes \
(excludes chrM). \
'{}': Use only allosomal chromosomes (excludes chrM). \
'{}': Use only the mitochondrial chromosome. \
'{}': Include the mitochondrial chromosome. \
NB: default chromosomal exclusions include: \
unmapped data, haplotypes, and chrM. \
This parameter will be ignored if a \
specific genomic region is queried via '-r', \
but will be considered if a file of genomic \
regions is provided (also via '-r'). \
".format(_AUTOSOME_ONLY_FLAG,
_ALLOSOME_ONLY_FLAG,
_MITOCHONDRIAL_ONLY_FLAG,
_MITOCHONDRIAL_INCLUSION_FLAG))
parser.add_argument('-p', '--priority', default=_DEFAULT_BASE_PRIORITY,
help="Specify the priority \
of modified bases. The default is:"
+ _DEFAULT_BASE_PRIORITY + ", which is based upon "
+ _DEFAULT_BASE_PRIORITY_COMMENT + ".")
BEDGeneration = parser.add_mutually_exclusive_group()
BEDGeneration.add_argument('-b', '--suppressBED', action='store_true',
help="Do not generate any BED tracks.")
BEDGeneration.add_argument('-B', '--onlyBED', action='store_true',
help="Only generate any BED tracks \
(i.e. do not output any sequence information). \
Note that generated BED files are always \
appended to and created in the CWD \
irrespective of the use of this option. \
This parameter is ignored if '-f' is used.")
parser.add_argument("--BEDOutDir", default='.',
help="Only applicable if '-b' is not used. \
The directory in which to save the created \
BED tracks. If not specified, this \
defaults to the current working directory.")
parser.add_argument('-f', '--fastaFile', nargs='?', type=str,
const=_DEFAULT_FASTA_FILENAME, help="Output to \
a file instead of STDOUT. Provide a full path \
to a file to append the modified genome in \
FASTA format. If this parameter is invoked \
without any arguments, a default filename \
will be used within the current directory. \
This will override the '-B' parameter \
(i.e. a FASTA file with always be produced). \
The output file will be Gzipped iff the \
path provided ends in \".gz\".")
parser.add_argument('-I', '--intersection', action='store_true',
help="If multiple files of the same modification \
type are given, take their intersection. \
This option is used to override the default, \
which is to take their union.")
ambigModUsage = \
parser.add_argument_group(title="Ambiguous Modification",
description="Specify that some of the data \
provided for a given modified base is unable \
to differentiate between some number of \
modifications. This ensures that Cytomod \
outputs the correct ambiguity code such that \
modified genomes do not purport to convey \
greater information than they truly contain. \
The most general applicable ambiguities should \
be specified. Therefore, each modified \
nucleobase may reside in at most one specified \
set of ambiguities.")
ambigModUsage.add_argument('--mh', action='store_const', const='mh',
help="Specify that input data is not able to \
differentiate between 5mC and 5hmC. This would \
be the case if the data originated from a protocol \
which only included conventional bisulfite \
sequencing.", default='')
ambigModUsage.add_argument('--fC', action='store_const', const='fC',
help="Specify that input data is not able to \
differentiate between 5fC and C. This would be the \
case if the data originated from a protocol which \
only included oxidative bisulfite sequencing.",
default='')
ambigModUsage.add_argument('--fc', action='store_const', const='fc',
help="Specify that input data is not able to \
differentiate between 5fC and 5caC. This would be \
the case if the data originated from a protocol \
which only included M.SssI methylase-assisted \
bisulfite sequencing.",
default='')
ambigModUsage.add_argument('-M', '--maskRegions', nargs='?', type=float,
const=_DEFAULT_MASK_VALUE,
help="Hard mask C/G nucleobases to unknown state. \
Assumes that the archive contains or is \
being built with a trackname containing \"" +
_MASK_TNAME + "\". \
The containing loci will be interpreted as \
nucleobases of unknown modification state. \
They will be accordingly set to the appropriate \
(maximally) ambiguous base. This will override any \
other modifications at those loci. \
This parameter can accept an optional argument, \
indicating a value at and below which the locus \
is considered ambiguous. If not provided, \
this defaults to " + str(_DEFAULT_MASK_VALUE) + ". \
An example use case for this option would be to \
use a mask file containing coverage information \
and to mask all bases of insufficient coverage.")
ambigModUsage.add_argument('--maskAllUnsetRegions', action='store_true',
help="Hard mask all C/G nucleobases without \
any modification information to unknown state. \
Masked nucleobases are those lacking data, \
that is, bases not present in the archive \
nor in any files used to generate an archive. \
Set, but unmodified, bases can be provided, \
within any modified genomic interval file with \
a value of 0. See '-d' for further details.\
Unset modifiable bases will be accordingly set to \
the appropriate (maximally) ambiguous base. \
This will override any other modifications \
at those loci. An example use case for this option \
would be when using array data, for which \
only a subset of bases are queried.")
# TODO Implement this?
# NB: neither TRF nor dustmasker work upon modified genomes
# parser.add_argument('-M', '--hardMaskRepetitiveRegions',
# help="Hard mask low complexity regions. The program \
# first will attempt to use the Tandem Repeat Finder \
# (TRF) program, by Gary Benson, using a system call \
# to the 'trf' executable. If this is not available, \
# the program will attempt to use dustmasker, of the NCBI \
# toolkit, by Morgulis et al (executed via 'dustmasker'). \
# By default, a threshold of 16 is used with dustmasker, \
# incrasing its masking, as done by Frith et al. 2010. \
# An argument containing the first letter of a tool \
# may be passed to override this behaviour. \
# Additionally, passing 'B' or 'F' will use TRF \
# with parameters recommended by the TRF author \
# or from Frith et al., respectively. \
# Default behaviour is equivalent to providing the \
# 'F' sub-argument.", action='store_const', const='F')
parser.add_argument('-v', '--verbose', help="increase output verbosity",
action="count")
parser.add_argument('-V', '--version', action='version',
version="%(prog)s " + __version__)
args = parser.parse_args()
if (args.region and (not os.path.isfile(args.region))
and args.alterIncludedChromosomes):
# '-A' is ignored iff '-r' is provided and is not a path to a file
warn("""Chromosome inclusion or exclusion setting was ignored,
since a single specific region was specifed.""")
if args.centeredRegion and not args.region:
warn("""Centered region argument ignored, since no specific regions
were specifed. Specify '-r' with '-c' to use centred regions.""")
if args.onlyBED and args.fastaFile:
warn("""Request to only generate BED files ignored,
since an output FASTA path was provided.""")
if not args.archiveCompDirs and (args.archiveOutDir or args.archiveOutName !=
_DEFAULT_ARCHIVE_NAME):
warn("""Neither the output directory nor the output name of a
genome archive are applicable, because an archive
is not being created.""")
if args.suppressBED and args.BEDOutDir:
warn("""The directory provided for BED output has been ignored, since
BED output has been suppressed.""")
if args.alterIncludedChromosomes:
_modifychrmExclusionRegex(args.alterIncludedChromosomes)
# NB: Ensure to update this to include all arguments in the ambigModUsage group
# TODO It would be nice if there was an automated means of accomplishing this
# Ensure provided ambiguities are unique
if (''.join(OrderedDict.fromkeys(args.mh + args.fC + args.fc).keys()) !=
args.mh + args.fC + args.fc):
die("Provided ambiguity codes must be unique.")
# Create a map from ambiguity codes to the specified ambiguities
ambigMap = {}
for k in [args.fC, args.mh, args.fc]:
ambigMap.update({b: cUtils.INVERTED_AMBIG_MOD_BASES[k]
for b in ''.join(k)})
v_print_timestamp(args.verbose, "Using the following ambiguity map: " +
str(ambigMap) + ".", 2)
from genomedata import Genome, load_genomedata
genomeDataArchiveFullname = ""
if args.archiveCompDirs:
# Create the archive in the directory of the track files
# unless another directory is specified
defaultArchiveDir = os.path.dirname(args.archiveCompDirs[1]) if \
os.path.isfile(args.archiveCompDirs[1]) else args.archiveCompDirs[1]
genomeDataArchiveFullname = (args.archiveOutDir or defaultArchiveDir) + \
'/' + args.archiveOutName + '/'
v_print_timestamp(args.verbose, "Creating genomedata archive in {}.".
format(genomeDataArchiveFullname))
# Create the genome data archive
# Load all supported track files in the tracks directory
# Load all FASTA files in the sequences directory
# TODO this could be generalized and could be made less redundant
FASTA_file_list = glob.glob(args.archiveCompDirs[0] + "/*.fa") or \
glob.glob(args.archiveCompDirs[0] + "/*.fa.gz")
if not FASTA_file_list:
FASTA_file_list = glob.glob(args.archiveCompDirs[0] + "/*.mask") or \
glob.glob(args.archiveCompDirs[0] + "/*.mask.gz")
v_print_timestamp(args.verbose, """Detected that a repeat masked
genome is being used for archive creation.""")
if not FASTA_file_list:
die("Unable to locate FASTA input files for archive creation.")
load_genomedata.load_genomedata(
genomeDataArchiveFullname,
# args.archiveCompDirs[1] is either a directory of tracks
# or is the full path to a single track.
# TODO check for a valid track path, that actually
# contains tracks, otherwise throw an error.
# NB: the tracks parameter must be a *list of pair(s)*,
# specifying: (track name, track/directory path).
tracks=([(os.path.basename(args.archiveCompDirs[1]),
args.archiveCompDirs[1])]
if os.path.isfile(args.archiveCompDirs[1]) and
re.search(SUPPORTED_FORMATS_REGEX, args.archiveCompDirs[1])
else [(track, args.archiveCompDirs[1] + track)
for track in os.listdir(args.archiveCompDirs[1])
if re.search(SUPPORTED_FORMATS_REGEX, track)]),
# args.archiveCompDirs[0] contains the directory with all FASTAs
seqfilenames=FASTA_file_list, verbose=args.verbose)
else:
v_print_timestamp(args.verbose, "Using existing genomedata archive.")
genomeDataArchiveFullname = args.genomeDataArchiveFullname
with Genome(genomeDataArchiveFullname) as genome:
warnings.simplefilter("ignore") # Ignore supercontig warnings
v_print_timestamp(args.verbose, "Genomedata archive successfully loaded.")
maskRegionTName = ''
modBases = []
modOrder = []
tnames = {}
for track in genome.tracknames_continuous:
if _MASK_TNAME in str(track):
if args.maskRegions is not None:
modBases.append(cUtils.MASK_BASE)
tnames[_MASK_TNAME] = track
maskRegionTName = track
else:
warn("""Genomedata archive contains a mask track, but
'-M' was not given.
Masking will not be performed.""")
else:
trackToBase = {covalent_mod_base for
covalent_mod, covalent_mod_base in
cUtils.COVALENT_MOD_BASES_TO_BASE_MAP.iteritems()
if covalent_mod in track}.pop()
if trackToBase: # add a regular or ambigous modified base track
modBases.append(trackToBase)
else:
warn("Unrecognized track " + track + " has been ignored.")
if args.maskRegions is not None and _MASK_TNAME not in tnames:
die("""Masking of genome regions requires the generation of a
Genomedata archive containing a mask track.""")
# For modOrder, lowest numbers have higher priority (i.e. 0 is highest).
for i, base in enumerate(modBases): # get rel. ordering
modOrder += [(args.priority.index(base) if args.intersection
# in else, no int., so ensure unique orders
#
# take fourth power to ensure priority always
# outweighs index and overcomes similar priorities
# to ensure unique orders
else i + (args.priority.index(base) + 1)**4)]
# Get absolute (index-based) ordering from the relative ordering.
modOrder = [sorted(modOrder).index(x) for x in modOrder]
addtl_mask_msg = ""
if args.maskRegions is not None:
addtl_mask_msg = """All loci implicated by the mask will be masked
irrespective of any mods at those loci."""
# Masked bases are assigned the highest priority (mask all others).
modOrder = [order + 1 for order in modOrder]
modOrder[modBases.index(cUtils.MASK_BASE)] = 0
elif args.maskAllUnsetRegions:
addtl_mask_msg = """All loci with missing data will be masked."""
# No priority assigned, since this masking is performed separately.
v_print_timestamp(args.verbose,
"Masking is enabled. {}".format(addtl_mask_msg), 2)
v_print_timestamp(args.verbose, """The order of preference for base
modifications (from highest to lowest) is: """ +
','.join(list(args.priority)) + ".")
# Before computing modified bases in blocks, remove any existing BED files
# and write the tracks' headers.
# Also, store the tracks' names, keyed by modified base, for future use.
if not args.suppressBED:
trackID = os.path.splitext(os.path.basename(args.fastaFile
or _DEFAULT_FASTA_FILENAME))[0]
trackID += '-' if trackID else ''
procModBases = []
for base in modBases + cUtils.complement(modBases):
trackFileName = "{}/track-{}{}.bed.gz".\
format(args.BEDOutDir, trackID, base)
tnames[base] = trackFileName
# 'EAFP' way of removing any existing old tracks
try:
os.remove(trackFileName)
except OSError:
pass
with gzip.open(trackFileName, 'ab') as BEDTrack:
BEDTrack.write(getTrackHeader(base))
if args.region or args.randomRegion:
if args.randomRegion: # Random region
chrms, starts, ends = selectRandomRegion(genome, args.randomRegion)
regions = np.matrix([chrms, starts, ends])
elif os.path.isfile(args.region): # 'BED-like' set of regions
import pybedtools
# NB: In pybedtools, all regions behave as if they are 0-based,
# despite non-BED files being 1-based. Thus, we do not need to
# alter any code in our program.
BEDTool = pybedtools.BedTool(args.region)
# generate regions, only including the appropriate chromosomes
regions = np.matrix([[interval['chrom'], interval['start'],
interval['end']] for interval in BEDTool
if not re.search(CHROMOSOME_EXCLUSION_REGEX,
interval['chrom'])])
else: # A single, specific, region ('genome browser-like')
chrms, starts, ends = parseRegion(genome, args.region)
regions = np.matrix([chrms, starts, ends])
for i in xrange(0, len(regions.flat), 3):
chrm, start, end = regions.flat[i], int(regions.flat[i + 1]), \
int(regions.flat[i + 2])
if args.centeredRegion:
centre = int(math.floor((start + end) / 2))
start = int(centre - args.centeredRegion / 2)
if start < 0:
start = 0
end = int(centre + args.centeredRegion / 2)
if end > int(genome[chrm].end):
end = int(genome[chrm].end)
regionStr = chrm + ":" + str(start) + "-" + str(end)
v_print_timestamp(args.verbose, """Outputting the modified
genome for: """ + regionStr + ".")
if args.fastaFile:
generateFASTAFile(args.fastaFile, regionStr, genome, modOrder,
chrm, start, end, args.suppressBED, tnames,
ambigMap, args.maskRegions,
maskRegionTName, args.maskAllUnsetRegions)
else:
print(getModifiedGenome(genome, modOrder, chrm, start, end,
args.onlyBED, args.suppressBED,
tnames, ambigMap,
args.maskRegions, maskRegionTName,
args.maskAllUnsetRegions))
else:
for chromosome in [chromosome for chromosome in genome
if not re.search(CHROMOSOME_EXCLUSION_REGEX,
chromosome.name)]:
v_print_timestamp(args.verbose, """Outputting the modified
genome for: """ + chromosome.name)
generateFASTAFile(args.fastaFile or _DEFAULT_FASTA_FILENAME,
chromosome.name, genome, modOrder,
chromosome.name, int(chromosome.start),
int(chromosome.end), args.suppressBED, tnames,
ambigMap, args.maskRegions, maskRegionTName,
args.maskAllUnsetRegions)
v_print_timestamp(args.verbose, "Program complete.")