-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathatram_preprocessor.py
executable file
·179 lines (142 loc) · 6.51 KB
/
atram_preprocessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/usr/bin/env python3
"""
Start the atram preprocessor.
This wrapper module parses the input arguments and passes them to the module
that does the actual preprocessing (core_preprocessor.py).
"""
import argparse
import os
import textwrap
from datetime import date
from glob import glob
from itertools import chain
from os.path import join
import lib.blast as blast
import lib.db as db
import lib.util as util
from lib.core_preprocessor import preprocess
def parse_command_line():
    """Process command-line arguments.

    Builds the argument parser for the preprocessor, parses ``sys.argv``
    (arguments may also be read from a file via the ``@file`` prefix), and
    post-processes the result.

    Returns:
        dict: The parsed arguments (``vars()`` of the argparse namespace)
        with these adjustments:

        * ``end_1``, ``end_2``, ``mixed_ends`` and ``single_ends`` -- when
          given, each is replaced by the sorted list of files matching the
          supplied glob patterns.
        * ``shard_count`` -- replaced by the value returned from
          ``blast.default_shard_count``.

    Side effects:
        Prepends ``--path`` to the ``PATH`` environment variable when given,
        and calls ``blast.make_blast_output_dir``, ``blast.find_program``
        and ``util.temp_dir_exists``.
        NOTE(review): presumably these create the blast DB directory and
        verify that makeblastdb and the temp directory exist -- confirm
        against lib.blast / lib.util.
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.
        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".
        For example:
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
            GATTAA...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
            ATAGCC...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
            CGAAAA...
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@', description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1', '-1', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only end 1 sequences. The
            sequence names do not need an end suffix, we will assume the suffix
            is always 1. The files are in fasta or fastq format. You may
            repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--end-2', '-2', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume the
            suffix is always 2. The files are in fasta or fastq format. You
            may repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--mixed-ends', '-m', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have a mix of both end 1 and
            end 2 sequences (or single ends). The files are in fasta or fastq
            format. You may repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--single-ends', '-0', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only unpaired sequences.
            Any sequence suffix will be ignored. The files are in fasta or
            fastq format. You may repeat this argument or use wildcards.
            """)

    group = parser.add_argument_group('preprocessor arguments')

    # The default DB prefix is date-stamped so separate runs do not collide.
    blast_db = join('.', 'atram_' + date.today().isoformat())
    # argparse substitutes %(default)s itself; no str.format() call is needed.
    group.add_argument(
        '-b', '--blast-db', '--db', default=blast_db, metavar='DB',
        help="""This is the prefix of all of the blast database files. So you
            can identify different blast database sets. You may include a
            directory as part of the prefix. (default %(default)s)
            """)

    group.add_argument(
        '--cpus', '--processes', '--max-processes', type=int,
        default=_default_cpu_count(),
        help="""Number of CPU threads to use. (default %(default)s)
            """)

    group.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
            around for debugging.""")

    group.add_argument('-l', '--log-file', help="""Log file (full path).""")

    group.add_argument(
        '--log-level', choices=['debug', 'info', 'error', 'fatal'],
        default='info',
        help="""Log messages of the given level (or above). 'debug' shows the
            most messages and 'fatal' shows the least.
            (default %(default)s)""")

    group.add_argument(
        '-s', '--shards', '--number', type=int, metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If makeblastdb is not in your $PATH then use this to prepend
            directories to your path.""")

    group.add_argument(
        '--fasta', action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--fastq', action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--gzip', action='store_true',
        help="""Are these gzip files?""")

    group.add_argument(
        '--bzip', action='store_true',
        help="""Are these bzip files?""")

    group.add_argument(
        '--shuffle', action='store_true',
        help="""Shuffle sequences before putting them into blast files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. Use the platform's
    # separator (':' on POSIX, ';' on Windows) and tolerate an unset PATH.
    if args['path']:
        current_path = os.environ.get('PATH')
        os.environ['PATH'] = os.pathsep.join(
            part for part in (args['path'], current_path) if part)

    # Expand wildcards for every kind of input file and collect all of the
    # matches so they can be passed to the shard-count calculation.
    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            end_files = _expand_file_patterns(args[ends])
            args[ends] = end_files
            all_files.extend(end_files)

    args['shard_count'] = blast.default_shard_count(args, all_files)

    blast.make_blast_output_dir(args['blast_db'])
    blast.find_program('makeblastdb')
    util.temp_dir_exists(args['temp_dir'])

    return args


def _default_cpu_count():
    """Return the default for --cpus: keep 4 CPUs free, capped at 10."""
    # os.cpu_count() returns None when the count cannot be determined;
    # treat that as a single CPU instead of failing on None arithmetic.
    available = os.cpu_count() or 1
    return min(10, available - 4 if available > 4 else 1)


def _expand_file_patterns(patterns):
    """Expand each glob pattern and return all matches as one sorted list."""
    return sorted(chain.from_iterable(glob(pattern) for pattern in patterns))
if __name__ == "__main__":
    # Script entry point: parse the command line and hand the resulting
    # argument dictionary straight to the core preprocessor.
    preprocess(parse_command_line())