-
Notifications
You must be signed in to change notification settings - Fork 2
/
HG-CoLoR
executable file
·244 lines (226 loc) · 9.39 KB
/
HG-CoLoR
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#!/bin/bash
set -e
#Prints a help message
function print_help {
echo "Usage: $0 [options] --longreads LR.fasta --shortreads SR.fastq --out resultPrefix -K maxK"
echo ""
echo "Note: HG-CoLoR default parameters are adapted for a 50x coverage set of short reads with a 1% error rate."
echo "Please modify the parameters, in particular the --solid and --bestn ones, as indicated below"
echo "if using a set of short reads with a much higher coverage and/or a highly different error rate."
echo ""
echo " Input:"
echo " LR.fasta: fasta file of long reads, one sequence per line."
echo " SR.fastq: fastq file of short reads."
echo " Warning: only one file must be provided."
echo " If using paired reads, please concatenate them into one single file."
echo " resultPrefix: Prefix of the fasta files where to output the corrected, trim and split long reads."
echo " maxK: Maximum K-mer size of the variable-order de Bruijn graph."
echo ""
echo " Options:"
echo " --minorder INT, -k INT: Minimum k-mer size of the variable-order de Bruijn graph (default: K/2)."
echo " --solid INT, -S INT: Minimum number of occurrences to consider a short read k-mer as solid, after correction (default: 1)."
echo " This parameter should be carefully raised accordingly to the short reads coverage and accuracy,"
echo " and to the chosen maximum order of the graph."
echo " It should only be increased when using high coverage of short reads, or a small maximum order."
echo " --seedsoverlap INT, -o INT: Minimum overlap length to allow the merging of two overlapping seeds (default: K - 1)."
echo " --seedsdistance INT, -d INT: Maximum distance to consider two consecutive seeds for merging (default: 10)."
echo " --branches INT, -b INT: Maximum number of branches exploration (default: 1,250)."
echo " Raising this parameter will result in less split corrected long reads."
echo " However, it will also increase the runtime, and may create chimeric linkings between the seeds."
echo " --seedskips INT, -s INT: Maximum number of seed skips (default: 3)."
echo " --mismatches INT, -m INT: Allowed mismatches when attempting to link two seeds together (default: 3)."
echo " --bestn INT, -n INT: Top alignments to be reported by BLASR (default: 50)."
echo " This parameter should be raised accordingly to the short reads coverage."
echo " Its default value is adapted for a 50x coverage of short reads."
echo " It should be decreased with higher coverage, and increased with lower coverage."
echo " --nproc INT, -j INT: Number of processes to run in parallel (default: number of cores)."
echo " --tmpdir STRING, -t STRING: Path where to store the directory containing temporary files (default: working directory)."
echo " --kmcmem INT, -r INT: Maximum amount of RAM for KMC, in GB (default: 12)."
echo " --help, -h: Print this help message."
exit 1
}
#Set options to default values
longreads=""
LR=""
shortreads=""
SR=""
K=0
seedsoverlap=0
seedsdistance=10
k=0
solid=1
branches=1250
seedskips=3
mismatches=3
bestn=50
kmcmem="12"
nproc=$(nproc)
tmpdir="."
out=""
#Print help if no argument specified
if [[ "$1" == "" ]] ; then
print_help
fi
#Otions handling
while [[ "$1" != "" ]] ; do
case "$1" in
"--help"|"-h")
print_help ;;
"--longreads")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) longreads="$2" ; LR=$(basename "$2") ; shift 2 ;;
esac;;
"--shortreads")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) shortreads="$2" ; SR=$(basename "$2") ; shift 2 ;;
esac ;;
"--out")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) out="$2" ; shift 2 ;;
esac ;;
"--maxorder"|"-K")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) K="$2" ; shift 2 ;;
esac ;;
"--minorder"|"-k")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) k="$2" ; shift 2 ;;
esac ;;
"--solid"|"-S")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) solid="$2" ; shift 2 ;;
esac ;;
"--seedsoverlap"|"-o")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) seedsoverlap="$2" ; shift 2 ;;
esac ;;
"--seedsdistance"|"-d")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) seedsdistance="$2" ; shift 2 ;;
esac ;;
"--branches"|"-b")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) branches="$2" ; shift 2 ;;
esac ;;
"--seedskips"|"-s")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) seedskips="$2" ; shift 2 ;;
esac ;;
"--mismatches"|"-m")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) mismatches="$2" ; shift 2 ;;
esac ;;
"--bestn"|"-n")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) bestn="$2" ; shift 2 ;;
esac ;;
"--nproc"|"-j")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) nproc="$2" ; shift 2 ;;
esac ;;
"--tmpdir"|"-t")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) tmpdir="$2" ; shift 2 ;;
esac ;;
"--kmcmem"|"-r")
case "$2" in
"") echo "Error: $1 expects an argument" ; exit 1 ;;
*) kmcmem="$2" ; shift 2 ;;
esac ;;
--)
shift ; break ;;
*)
echo "Error: invalid option \"$1\"" ; exit 1 ;;
esac
done
#Exit if no short reads, no long reads, no temporary directory, or no output files have been specified
if [[ $LR == "" ]] ; then
echo "Error: --longreads must be specified";
exit 1;
fi
if [[ $SR == "" ]] ; then
echo "Error: --shortreads must be specified";
exit 1;
fi
if [[ $out == "" ]] ; then
echo "Error: --out must be specified";
exit 1;
fi
if [[ $K -eq 0 ]] ; then
echo "Error: -K must be specified";
exit 1;
fi
#If seedsoverlap or minorder haven't been set, set them to default values
if [[ $seedsoverlap -eq 0 ]] ; then
seedsoverlap=$((K-1))
fi
if [[ $k -eq 0 ]] ; then
k=$((K/2))
fi
#Remove the output file if it already exists
if [[ -f $out ]] ; then
rm $out
fi
#Create the directory for temporary files
tmpdir="$tmpdir/HGC_"$$
mkdir -p $tmpdir
#Temporary files names
aln="SR_on_LR.sam"
seeds="seeds"
formatLR="formatted_LR.fa"
#Get the path to HG-CoLoR's folder
hgs=$(readlink -f "$0")
hgf=$(dirname $hgs)
#Run the correction pipeline
echo "["$(date)"] Correcting the short reads"
echo "----- QuorUM -----" >> HG-CoLoR.stdout
echo "----- QuorUM -----" >> HG-CoLoR.stderr
quorum --prefix $tmpdir/corrected_SR -q 33 -t "$nproc" "$shortreads" >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
echo "["$(date)"] Removing short reads containing weak K-mers"
HGCfilterShortReads.py $tmpdir/corrected_SR.fa "$K" > $tmpdir/long_corrected_SR.fa
echo "----- revseq -----" >> HG-CoLoR.stdout
echo "----- revseq -----" >> HG-CoLoR.stderr
revseq $tmpdir/long_corrected_SR.fa -reverse -complement -outseq $tmpdir/RC_long_corrected_SR.fa >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
sed -i ':a; $!N; /^>/!s/\n\([^>]\)/\1/; ta; P; D' $tmpdir/RC_long_corrected_SR.fa
cat $tmpdir/long_corrected_SR.fa $tmpdir/RC_long_corrected_SR.fa > $tmpdir/all_long_corrected_SR.fa
echo "----- KMC -----" >> HG-CoLoR.stdout
echo "----- KMC -----" >> HG-CoLoR.stderr
kmc -m"$kmcmem" -k"$K" -ci"$solid" -b -fa $tmpdir/all_long_corrected_SR.fa $tmpdir/mers.db $tmpdir >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
echo "----- KMC_tools -----" >> HG-CoLoR.stdout
echo "----- KMC_tools -----" >> HG-CoLoR.stderr
kmc_tools filter $tmpdir/mers.db -ci"$solid" $tmpdir/long_corrected_SR.fa -ci1.0 -fa $tmpdir/good_long_corrected_SR.fa -fa >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
rm $tmpdir/long_corrected_SR.fa $tmpdir/RC_long_corrected_SR.fa $tmpdir/all_long_corrected_SR.fa $tmpdir/corrected_SR.fa
echo "["$(date)"] Building the graph"
echo "----- KMC_dump -----" >> HG-CoLoR.stdout
echo "----- KMC_dump -----" >> HG-CoLoR.stderr
kmc_dump -ci"$solid" $tmpdir/mers.db $tmpdir/"dumped_$K-mers" >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
cut -f 1 $tmpdir/"dumped_$K-mers" > $tmpdir/"$K-mers.fa"
echo "----- PgSAgen -----" >> HG-CoLoR.stdout
echo "----- PgSAgen -----" >> HG-CoLoR.stderr
PgSAgen $tmpdir/"$K-mers.fa" $tmpdir/"$K-mers.fa" >> HG-CoLoR.stdout 2>> HG-CoLoR.stderr
HGCformatLongReads.py "$longreads" > $tmpdir/"$formatLR"
echo "["$(date)"] Aligning the short reads on the long reads"
echo "----- blasr -----" >> HG-CoLoR.stderr
blasr $tmpdir/good_long_corrected_SR.fa $tmpdir/"$formatLR" --nproc "$nproc" -m 6 --minAlnLength "$K" --bestn "$bestn" | sort -T $tmpdir/ -k 3 --parallel="$nproc" > $tmpdir/"$aln" 2>> HG-CoLoR.stderr
echo "["$(date)"] Generating the corrected long reads"
ulimit -s 65536
HG-CoLoR-correct -t "$tmpdir" -K "$K" -d "$seedsdistance" -o "$seedsoverlap" -k "$k" -b "$branches" -s "$seedskips" -m "$mismatches" -j "$nproc" -r $tmpdir/"$formatLR" -a $tmpdir/"$aln" $tmpdir/"$K-mers.fa.pgsa" > "$out.fasta"
HGCtrim.py "$out.fasta" > "$out.trim.fasta"
HGCsplit.py "$out.trim.fasta" > "$out.split.fasta"
echo "["$(date)"] Removing the temporary files"
rm -Rf $tmpdir
echo "["$(date)"] Exiting"