@@ -62,8 +62,8 @@ function alignment::slice(){
62
62
df <- df[srt$ix,];
63
63
64
64
lmax <- max(len);
65
- n <- slices+1 ;
66
- bins <- c( );
65
+ bins <- c(1:nrow(df)) ;
66
+ n <- length(bins );
67
67
i <- 1;
68
68
while(n > slices){
69
69
i <- i+1;
@@ -97,9 +97,6 @@ function alignment::slice(){
97
97
o=" $tdir " /$( basename " $f " )
98
98
o=" ${o% .* } "
99
99
100
- # alignment::_index -1 cmd2 -t $ithreads -f "$f" -o "$o.bai"
101
- # params="-X '$o.bai'"
102
-
103
100
mapfile -t mapdata < <( find " $tmpdir /genome" -maxdepth 1 -type f -name " slice.*.bed" | sort -V)
104
101
printf " %s\n" " ${mapdata[@]} " | sed -E " s@.+\.([0-9]+)\.bed@$o .slice.\1.bam@" > " $o .slices.info"
105
102
_bamslices_slice[" $f " ]=" $o .slices.info"
@@ -264,7 +261,7 @@ function alignment::rmduplicates(){
264
261
CMD
265
262
| awk -v f=<( helper::cat -f " ${umi_rmduplicates[$i]} " | paste - - - -)
266
263
CMD
267
- -v OFS=' \t' ' /^@\S\S\s/{print; next}{l=$0; r="@"$1; getline < f; while(r!=$1){getline < f} print l,"RX:Z:"$(NF-2)}'
264
+ -v OFS=' \t' ' /^@\S\S\s/{print; next}{l=$0; r="@"$1; getline < f; while(r!=$1){getline < f} print l,"RX:Z:"$(NF-2),"QX:Z:"$NF }'
268
265
CMD
269
266
| samtools sort
270
267
-@ $ithreads
@@ -613,14 +610,15 @@ function alignment::clipmateoverlaps(){
613
610
-r <mapper> | array of sorted, indexed bams within array of
614
611
-c <sliceinfo> | array of
615
612
-o <outdir> | path to
613
+ -u | mark fully softclipped mate as unmapped to avoid e.g. downstream GATK BaseRecalibrator issue java.lang.IllegalStateException: cigar is completely soft-clipped
616
614
EOF
617
615
return 1
618
616
}
619
617
620
618
local OPTIND arg mandatory skip=false threads memory maxmemory outdir
621
- declare -n _mapper_clipmateoverlaps _bamslices_clipmateoverlaps
619
+ declare -n _mapper_clipmateoverlaps _bamslices_clipmateoverlaps unmapped=false
622
620
declare -A nidx tidx
623
- while getopts ' S:s:t:m:M:r:c:o:' arg; do
621
+ while getopts ' S:s:t:m:M:r:c:o:u ' arg; do
624
622
case $arg in
625
623
S) $OPTARG && return 0;;
626
624
s) $OPTARG && skip=true;;
@@ -630,6 +628,7 @@ function alignment::clipmateoverlaps(){
630
628
r) (( ++ mandatory)) ; _mapper_clipmateoverlaps=$OPTARG ;;
631
629
c) (( ++ mandatory)) ; _bamslices_clipmateoverlaps=$OPTARG ;;
632
630
o) (( ++ mandatory)) ; outdir=" $OPTARG " ; mkdir -p " $outdir " ;;
631
+ u) unmapped=true;;
633
632
* ) _usage;;
634
633
esac
635
634
done
@@ -646,7 +645,9 @@ function alignment::clipmateoverlaps(){
646
645
local ithreads instances=$(( ${# _mapper_clipmateoverlaps[@]} * ${# _bams_clipmateoverlaps[@]} ))
647
646
read -r instances ithreads < <( configure::instances_by_threads -i $instances -t 10 -T $threads )
648
647
649
- local m i o slice odir
648
+ local m i o slice odir params
649
+ $unmapped && params=' --unmapped' || params=' '
650
+
650
651
declare -a tomerge cmd1 cmd2 cmd3
651
652
for m in " ${_mapper_clipmateoverlaps[@]} " ; do
652
653
declare -n _bams_clipmateoverlaps=$m
@@ -658,13 +659,13 @@ function alignment::clipmateoverlaps(){
658
659
659
660
while read -r slice; do
660
661
# bamutil clips second read even full length, and by default does not produce unmapped reads
661
- # --unmapped marks full length clipped reads as unmapped and does fixmate afterwards
662
- # -> when combined with --excludeFlags (default:0xF0C, better: 0x80C) that includes UNMAP and MUNMAP, pairs with 2nd read fully clipped gets removed
662
+ # --unmapped marks full length clipped reads as unmapped and fixes both mates flags.
663
663
# use stdout with bam extension to enfoce bam output
664
664
commander::makecmd -a cmd1 -s ' ;' -c {COMMANDER[0]}<< - CMD
665
665
bam clipOverlap
666
666
--in "$slice "
667
667
--out -.bam
668
+ $params
668
669
--excludeFlags 0x0
669
670
--poolSize $poolsize
670
671
--stats
@@ -1407,15 +1408,18 @@ function alignment::bqsr(){
1407
1408
1408
1409
commander::printinfo " base quality score recalibration"
1409
1410
1410
- local minstances mthreads jmem jgct jcgct
1411
+ local minstances mthreads jmem jgct jcgct minstances2 mthreads2 jmem2 jgct2 jcgct2
1411
1412
read -r minstances mthreads jmem jgct jcgct < <( configure::jvm -T $threads -m $memory -M " $maxmemory " )
1412
1413
1413
1414
declare -n _bams_bqsr=" ${_mapper_bqsr[0]} "
1414
1415
local ithreads instances=$(( ${# _mapper_bqsr[@]} * ${# _bams_bqsr[@]} ))
1415
- read -r instances ithreads < <( configure::instances_by_threads -i $instances -t 10 -T $threads )
1416
+ read -r minstances2 mthreads2 jmem2 jgct2 jcgct2 < <( configure::jvm -i $instances -T $threads -M " $maxmemory " ) # for gatk simple tasks like gathering
1417
+ read -r instances ithreads < <( configure::instances_by_threads -i $instances -t 10 -T $threads ) # for samtools
1416
1418
1417
- local m i o slice odir
1418
- declare -a tdirs tomerge cmd1 cmd2 cmd3 cmd4
1419
+ local m i o slice odir params
1420
+ [[ -s " $genome .InDel_gold.vcf.gz" ]] && params=" --known-sites '$dbsnp ' --known-sites '$genome .InDel_gold.vcf.gz'" || params=" --known-sites '$dbsnp '"
1421
+
1422
+ declare -a reportfile tdirs tomerge cmd1 cmd2 cmd3 cmd4 cmd5
1419
1423
for m in " ${_mapper_bqsr[@]} " ; do
1420
1424
declare -n _bams_bqsr=$m
1421
1425
odir=" $outdir /$m "
@@ -1424,6 +1428,8 @@ function alignment::bqsr(){
1424
1428
for i in " ${! _bams_bqsr[@]} " ; do
1425
1429
tomerge=()
1426
1430
1431
+ reportfile=" $( mktemp -p " $tmpdir " cleanup.XXXXXXXXXX.bqsreport) "
1432
+
1427
1433
while read -r slice; do
1428
1434
# https://gatkforums.broadinstitute.org/gatk/discussion/7131/is-indel-realignment-removed-from-gatk4
1429
1435
# https://software.broadinstitute.org/gatk/blog?id=7847
@@ -1449,32 +1455,33 @@ function alignment::bqsr(){
1449
1455
'
1450
1456
BaseRecalibrator
1451
1457
-R "$genome "
1452
- --known-sites " $dbsnp "
1458
+ $params
1453
1459
-I "$slice "
1454
1460
-O "$slice .bqsreport"
1455
1461
-verbosity ERROR
1456
1462
--tmp-dir "${tdirs[-1]} "
1457
1463
CMD
1458
1464
1459
- commander::makecmd -a cmd2 -s ' ;' -c {COMMANDER[0]}<< - CMD {COMMANDER[1]}<<- CMD
1465
+ tdirs+=(" $( mktemp -d -p " $tmpdir " cleanup.XXXXXXXXXX.gatk) " )
1466
+ commander::makecmd -a cmd3 -s ' ;' -c {COMMANDER[0]}<< - CMD {COMMANDER[1]}<<- CMD
1460
1467
rm -rf "$slice .bqsr.parts"
1461
1468
CMD
1462
- gatk
1469
+ MALLOC_ARENA_MAX=4 gatk
1463
1470
--java-options '
1464
1471
-Xmx${jmem}m
1465
1472
-XX:ParallelGCThreads=$jgct
1466
1473
-XX:ConcGCThreads=$jcgct
1467
1474
-Djava.io.tmpdir="$tmpdir"
1468
1475
'
1469
1476
ApplyBQSR
1470
- -bqsr " $slice .bqsreport "
1477
+ -bqsr " $reportfile "
1471
1478
-I " $slice "
1472
1479
-O " $slice .bqsr"
1473
1480
-verbosity ERROR
1474
1481
--tmp-dir " ${tdirs[-1]} "
1475
1482
CMD
1476
1483
1477
- commander::makecmd -a cmd3 -s ' ;' -c {COMMANDER[0]}<< - CMD {COMMANDER[1]}<<- CMD
1484
+ commander::makecmd -a cmd4 -s ' ;' -c {COMMANDER[0]}<< - CMD {COMMANDER[1]}<<- CMD
1478
1485
mv "$slice .bqsr" "$slice "
1479
1486
CMD
1480
1487
samtools index -@ $ithreads " $slice " " ${slice% .* } .bai"
@@ -1483,10 +1490,25 @@ function alignment::bqsr(){
1483
1490
tomerge+=(" $slice " )
1484
1491
done < " ${_bamslices_bqsr[${_bams_bqsr[$i]} ]}"
1485
1492
1493
+ tdirs+=(" $( mktemp -d -p " $tmpdir " cleanup.XXXXXXXXXX.gatk) " )
1494
+ commander::makecmd -a cmd2 -s ' ;' -c {COMMANDER[0]}<< - CMD
1495
+ MALLOC_ARENA_MAX=4 gatk
1496
+ --java-options '
1497
+ -Xmx${jmem2} m
1498
+ -XX:ParallelGCThreads=$jgct2
1499
+ -XX:ConcGCThreads=$jcgct2
1500
+ -Djava.io.tmpdir="$tmpdir "
1501
+ '
1502
+ GatherBQSRReports
1503
+ $( printf -- ' -I "%s" ' " ${tomerge[@]/%/ .bqsreport} " )
1504
+ -O "$reportfile "
1505
+ --tmp-dir "${tdirs[-1]} "
1506
+ CMD
1507
+
1486
1508
o=" $odir /$( basename " ${_bams_bqsr[$i]} " ) "
1487
1509
o=" ${o% .* } .bqsr.bam"
1488
1510
# slices have full sam header info used by merge to maintain the global sort order
1489
- commander::makecmd -a cmd4 -s ' |' -c {COMMANDER[0]}<< - CMD
1511
+ commander::makecmd -a cmd5 -s ' |' -c {COMMANDER[0]}<< - CMD
1490
1512
samtools merge
1491
1513
-@ $ithreads
1492
1514
-f
@@ -1509,11 +1531,13 @@ function alignment::bqsr(){
1509
1531
commander::printcmd -a cmd2
1510
1532
commander::printcmd -a cmd3
1511
1533
commander::printcmd -a cmd4
1534
+ commander::printcmd -a cmd5
1512
1535
else
1513
1536
commander::runcmd -c gatk -v -b -i $minstances -a cmd1
1514
- commander::runcmd -c gatk -v -b -i $minstances -a cmd2
1515
- commander::runcmd -v -b -i $instances -a cmd3
1537
+ commander::runcmd -c gatk -v -b -i $minstances2 -a cmd2
1538
+ commander::runcmd -c gatk - v -b -i $minstances -a cmd3
1516
1539
commander::runcmd -v -b -i $instances -a cmd4
1540
+ commander::runcmd -v -b -i $instances -a cmd5
1517
1541
fi
1518
1542
1519
1543
return 0
0 commit comments