30
30
* AMRFinder
31
31
*
32
32
* Dependencies: NCBI BLAST, HMMer
33
+ * gunzip (optional)
33
34
*
34
35
* Release changes:
36
+ * 05/06/2023 PD-4598 error messages in curl_easy.cpp
37
+ * 3.11.14 05/05/2023 extern "C" { #include <curl.h> }
38
+ * 3.11.13 05/04/2023 PD-4596 Prohibit ASCII characters only between 0x00 and 0x1F in GFF files
39
+ * 04/24/2023 PD-4583 Process files ending with ".gz", see https://github.com/ncbi/amr/issues/61, dependence on gunzip (optional)
40
+ * 04/19/2023 On failure no empty output file (-o) is created
41
+ * 3.11.12 04/13/2023 Application::makeKey()
35
42
* PD-4548 fasta_check.cpp prohibits '\t' (not any '\'), and all restrictions are only for nucleotide sequences
36
43
* 3.11.11 04/13/2023 PD-4566 --hmmer_bin
37
44
* 3.11.10 04/12/2023 PD-4548 fasta_check.cpp prohibits ';', '.', '~' in the last position of a sequence identifier
@@ -470,6 +477,22 @@ struct ThisApplication : ShellApplication
470
477
t. qc ();
471
478
t. saveFile (tmp + " /" + tmpSuf);
472
479
}
480
+
481
+
482
+
483
+ string uncompress (const string "edFName,
484
+ const string &suffix) const
485
+ {
486
+ const string res (shellQuote (tmp + " /" + suffix));
487
+ ASSERT (quotedFName != res);
488
+ const string s (unQuote (quotedFName));
489
+ if (isRight (s, " .gz" ))
490
+ {
491
+ exec (" gunzip -c " + quotedFName + " > " + res);
492
+ return res;
493
+ }
494
+ return quotedFName;
495
+ }
473
496
474
497
475
498
@@ -553,8 +576,11 @@ struct ThisApplication : ShellApplication
553
576
}
554
577
555
578
if (! output. empty ())
579
+ {
556
580
try { OFStream f (output); }
557
581
catch (...) { throw runtime_error (" Cannot create output file " + shellQuote (output)); }
582
+ removeFile (output);
583
+ }
558
584
559
585
560
586
// For timing...
@@ -641,13 +667,9 @@ struct ThisApplication : ShellApplication
641
667
if (! dbDir. items. empty () && dbDir. items. back () == " latest" )
642
668
{
643
669
prog2dir [" amrfinder_update" ] = execDir;
644
- string blast_bin_par;
645
- if (! blast_bin. empty ())
646
- blast_bin_par = " --blast_bin " + shellQuote (blast_bin);
647
- string hmmer_bin_par;
648
- if (! hmmer_bin. empty ())
649
- hmmer_bin_par = " --hmmer_bin " + shellQuote (hmmer_bin);
650
- exec (fullProg (" amrfinder_update" ) + " -d " + shellQuote (dbDir. getParent ()) + ifS (force_update, " --force_update" ) + blast_bin_par + hmmer_bin_par
670
+ exec (fullProg (" amrfinder_update" ) + " -d " + shellQuote (dbDir. getParent ()) + ifS (force_update, " --force_update" )
671
+ + makeKey (" blast_bin" , blast_bin)
672
+ + makeKey (" hmmer_bin" , hmmer_bin)
651
673
+ ifS (quiet, " -q" ) + ifS (qc_on, " --debug" ) + " > " + logFName, logFName);
652
674
}
653
675
else
@@ -759,18 +781,27 @@ struct ThisApplication : ShellApplication
759
781
760
782
for (const string& include : includes)
761
783
stderr << " - include " << include << ' \n ' ;
784
+ }
785
+
786
+
787
+ // Quoted names
788
+ const string prot_flat = uncompress (prot, " prot_flat" );
789
+ const string dna_flat = uncompress (dna, " dna_flat" );
790
+ const string gff_flat = uncompress (gff, " gff_flat" );
791
+
762
792
793
+ {
763
794
StringVector emptyFiles;
764
- if (! emptyArg (prot) && ! getFileSize (unQuote (prot ))) emptyFiles << prot;
765
- if (! emptyArg (dna) && ! getFileSize (unQuote (dna ))) emptyFiles << dna;
766
- if (! emptyArg (gff) && ! getFileSize (unQuote (gff ))) emptyFiles << gff;
795
+ if (! emptyArg (prot) && ! getFileSize (unQuote (prot_flat ))) emptyFiles << prot;
796
+ if (! emptyArg (dna) && ! getFileSize (unQuote (dna_flat ))) emptyFiles << dna;
797
+ if (! emptyArg (gff) && ! getFileSize (unQuote (gff_flat ))) emptyFiles << gff;
767
798
for (const string& emptyFile : emptyFiles)
768
799
{
769
800
const Warning warning (stderr);
770
801
stderr << " Empty file: " << emptyFile;
771
802
}
772
803
}
773
-
804
+
774
805
775
806
// organism --> organism1
776
807
string organism1;
@@ -853,7 +884,7 @@ struct ThisApplication : ShellApplication
853
884
bool lcl = false ;
854
885
if (gffType == Gff::pgap && ! emptyArg (dna)) // PD-3347
855
886
{
856
- LineInput f (unQuote (dna ));
887
+ LineInput f (unQuote (dna_flat ));
857
888
while (f. nextLine ())
858
889
if (isLeft (f. line, " >" ))
859
890
{
@@ -863,6 +894,9 @@ struct ThisApplication : ShellApplication
863
894
}
864
895
865
896
897
+ const bool blastn = ! emptyArg (dna) && ! organism1. empty () && fileExists (db + " /AMR_DNA-" + organism1);
898
+
899
+
866
900
// Create files for amr_report
867
901
string amr_report_blastp;
868
902
string amr_report_blastx;
@@ -881,20 +915,20 @@ struct ThisApplication : ShellApplication
881
915
{
882
916
string gff_prot_match;
883
917
string gff_dna_match;
884
- if (getFileSize (unQuote (prot )))
918
+ if (getFileSize (unQuote (prot_flat )))
885
919
{
886
920
findProg (" blastp" );
887
921
findProg (" hmmsearch" );
888
922
889
- string prot1 (prot ); // Protein FASTA with no dashes in the sequences
923
+ string prot1 (prot_flat ); // Protein FASTA with no dashes in the sequences
890
924
size_t nProt = 0 ;
891
925
size_t protLen_max = 0 ;
892
926
size_t protLen_total = 0 ;
893
- if (! fastaCheck (prot , true , qcS, logFName, nProt, protLen_max, protLen_total))
927
+ if (! fastaCheck (prot_flat , true , qcS, logFName, nProt, protLen_max, protLen_total))
894
928
{
895
929
prot1 = shellQuote (tmp + " /prot" );
896
930
OFStream outF (unQuote (prot1));
897
- LineInput f (unQuote (prot ));
931
+ LineInput f (unQuote (prot_flat ));
898
932
while (f. nextLine ())
899
933
{
900
934
trimTrailing (f. line);
@@ -941,13 +975,13 @@ struct ThisApplication : ShellApplication
941
975
string dnaPar;
942
976
if (! emptyArg (dna))
943
977
{
944
- dnaPar = " -dna " + dna ;
978
+ dnaPar = " -dna " + dna_flat ;
945
979
if (gffType == Gff::pseudomonasdb)
946
980
gff_dna_match = " -gff_dna_match " + tmp + " /dna_match" ;
947
981
}
948
982
try
949
983
{
950
- exec (fullProg (" gff_check" ) + gff + annotS + " -prot " + prot1 + dnaPar + gff_prot_match + gff_dna_match + qcS + " -log " + logFName, logFName);
984
+ exec (fullProg (" gff_check" ) + gff_flat + annotS + " -prot " + prot1 + dnaPar + gff_prot_match + gff_dna_match + qcS + " -log " + logFName, logFName);
951
985
}
952
986
catch (...)
953
987
{
@@ -1004,19 +1038,18 @@ struct ThisApplication : ShellApplication
1004
1038
1005
1039
amr_report_blastp = " -blastp " + tmp + " /blastp -hmmsearch " + tmp + " /hmmsearch -hmmdom " + tmp + " /dom" ;
1006
1040
if (! emptyArg (gff))
1007
- amr_report_blastp += " -gff " + gff + gff_prot_match + gff_dna_match + annotS;
1041
+ amr_report_blastp += " -gff " + gff_flat + gff_prot_match + gff_dna_match + annotS;
1008
1042
}
1009
1043
1010
1044
1011
1045
if (! emptyArg (dna))
1012
1046
{
1013
- const bool blastn = ! organism1. empty () && fileExists (db + " /AMR_DNA-" + organism1);
1014
- if (getFileSize (unQuote (dna)))
1047
+ if (getFileSize (unQuote (dna_flat)))
1015
1048
{
1016
1049
size_t nDna = 0 ;
1017
1050
size_t dnaLen_max = 0 ;
1018
1051
size_t dnaLen_total = 0 ;
1019
- EXEC_ASSERT (fastaCheck (dna , false , qcS, logFName, nDna, dnaLen_max, dnaLen_total));
1052
+ EXEC_ASSERT (fastaCheck (dna_flat , false , qcS, logFName, nDna, dnaLen_max, dnaLen_total));
1020
1053
const string blastx (/* "tblastn"*/ dnaLen_max > 100000 ? " tblastn" : " blastx" ); // PAR // SB-3643
1021
1054
1022
1055
stderr. section (" Running " + blastx);
@@ -1028,14 +1061,14 @@ struct ThisApplication : ShellApplication
1028
1061
const string blastx_par (blastp_par + " -word_size 3 -query_gencode " + to_string (gencode));
1029
1062
ASSERT (threads_max >= 1 );
1030
1063
if (blastx == " blastx" )
1031
- exec (fullProg (" blastx" ) + " -query " + dna + " -db " + tmp + " /db/AMRProt" + " "
1064
+ exec (fullProg (" blastx" ) + " -query " + dna_flat + " -db " + tmp + " /db/AMRProt" + " "
1032
1065
+ blastx_par + " " BLAST_FMT " " + get_num_threads_param (" blastx" , min (nDna, dnaLen_total / 10002 ))
1033
1066
+ " -out " + tmp + " /blastx > /dev/null 2> " + tmp + " /blastx-err" , tmp + " /blastx-err" );
1034
1067
else
1035
1068
{
1036
1069
ASSERT (blastx == " tblastn" );
1037
1070
findProg (" makeblastdb" );
1038
- exec (fullProg (" makeblastdb" ) + " -in " + dna + " -out " + tmp + " /nucl" + " -dbtype nucl -logfile " + tmp + " /makeblastdb.log" , tmp + " /makeblastdb.log" );
1071
+ exec (fullProg (" makeblastdb" ) + " -in " + dna_flat + " -out " + tmp + " /nucl" + " -dbtype nucl -logfile " + tmp + " /makeblastdb.log" , tmp + " /makeblastdb.log" );
1039
1072
if (threads_max > 1 )
1040
1073
{
1041
1074
createDirectory (tmp + " /AMRProt_chunk" );
@@ -1061,7 +1094,7 @@ struct ThisApplication : ShellApplication
1061
1094
findProg (" blastn" );
1062
1095
stderr. section (" Running blastn" );
1063
1096
const Chronometer_OnePass cop (" blastn" , cerr, false , qc_on && ! quiet);
1064
- exec (fullProg (" blastn" ) + " -query " + dna + " -db " + tmp + " /db/AMR_DNA-" + organism1 + " -evalue 1e-20 -dust no -max_target_seqs 10000 "
1097
+ exec (fullProg (" blastn" ) + " -query " + dna_flat + " -db " + tmp + " /db/AMR_DNA-" + organism1 + " -evalue 1e-20 -dust no -max_target_seqs 10000 "
1065
1098
+ get_num_threads_param (" blastn" , min (nDna, dnaLen_total / 2500000 )) + " " BLAST_FMT " -out " + tmp + " /blastn > " + logFName + " 2> " + tmp + " /blastn-err" , tmp + " /blastn-err" );
1066
1099
}
1067
1100
}
@@ -1127,10 +1160,7 @@ struct ThisApplication : ShellApplication
1127
1160
+ ifS (suppress_common, " -suppress_prot " + tmp + " /suppress_prot" )
1128
1161
+ nameS + qcS + " " + parm + " -log " + logFName + " > " + tmp + " /amr" , logFName);
1129
1162
}
1130
- if ( ! emptyArg (dna)
1131
- && ! organism1. empty ()
1132
- && fileExists (db + " /AMR_DNA-" + organism1)
1133
- )
1163
+ if (blastn)
1134
1164
{
1135
1165
const Chronometer_OnePass cop (" dna_mutation" , cerr, false , qc_on && ! quiet);
1136
1166
const string mutation_allS (mutation_all. empty () ? " " : (" -mutation_all " + tmp + " /mutation_all.dna" ));
@@ -1188,7 +1218,7 @@ struct ThisApplication : ShellApplication
1188
1218
if (! emptyArg (dna_out))
1189
1219
{
1190
1220
prepare_fasta_extract (StringVector {" Contig id" , " Start" , " Stop" , " Strand" , " Gene symbol" , " Sequence name" }, " dna_out" , false );
1191
- exec (fullProg (" fasta_extract" ) + dna + " " + tmp + " /dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName);
1221
+ exec (fullProg (" fasta_extract" ) + dna_flat + " " + tmp + " /dna_out" + qcS + " -log " + logFName + " > " + dna_out, logFName);
1192
1222
}
1193
1223
if (! emptyArg (dnaFlank5_out))
1194
1224
{
@@ -1204,7 +1234,7 @@ struct ThisApplication : ShellApplication
1204
1234
t. saveHeader = false ;
1205
1235
t. qc ();
1206
1236
t. saveFile (tmp + " /dnaFlank5_out" );
1207
- exec (fullProg (" fasta_extract" ) + dna + " " + tmp + " /dnaFlank5_out" + qcS + " -log " + logFName + " > " + dnaFlank5_out, logFName);
1237
+ exec (fullProg (" fasta_extract" ) + dna_flat + " " + tmp + " /dnaFlank5_out" + qcS + " -log " + logFName + " > " + dnaFlank5_out, logFName);
1208
1238
}
1209
1239
1210
1240
0 commit comments