Skip to content

Commit 01dbead

Browse files
committed
Merge branch 'dev'
2 parents 185a69f + c0f7904 commit 01dbead

15 files changed

+475
-355
lines changed

amr_report.cpp

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,7 @@ map <string/*accession*/, Susceptible> accession2susceptible;
334334

335335
bool cdsExist = false;
336336
bool print_node = false;
337+
bool print_node_raw = false;
337338

338339
bool reportPseudo = false;
339340
//const string stopCodonS ("[stop]");
@@ -698,7 +699,12 @@ struct BlastAlignment : Alignment
698699
}
699700
}
700701
if (print_node)
701-
td << famId;
702+
{
703+
if (! print_node_raw && allele () && ! refExactlyMatched ())
704+
td << gene;
705+
else
706+
td << famId;
707+
}
702708
IMPLY ((isMutationProt () && ! seqChange. empty () && mut && ! seqChange. replacement), hasMutation ());
703709
if ( ! isMutationProt ()
704710
|| (! seqChange. empty () && mut && ! seqChange. replacement) // resistant mutation
@@ -808,7 +814,7 @@ struct BlastAlignment : Alignment
808814
}
809815
public:
810816
string fusion2geneSymbols () const
811-
{ ASSERT (! isMutationProt ());
817+
{ ASSERT (! isMutationProt ());
812818
if (fusions. empty ())
813819
return getGeneSymbol ();
814820
string s;
@@ -954,15 +960,14 @@ struct BlastAlignment : Alignment
954960
&& refCoverage () >= br. ref_coverage - frac_delta
955961
;
956962
}
963+
bool partialPseudo () const
964+
{ return partial ()
965+
&& ! cdss. empty ()
966+
&& ! truncatedCds ();
967+
}
957968
bool pseudo () const
958-
{ if (stopCodon)
959-
return true;
960-
if ( partial ()
961-
&& ! cdss. empty ()
962-
&& ! truncatedCds ()
963-
)
964-
return true;
965-
return false;
969+
{ return stopCodon
970+
|| partialPseudo ();
966971
}
967972
bool good () const
968973
{ if (refAccession. empty ())
@@ -999,7 +1004,7 @@ struct BlastAlignment : Alignment
9991004
&& targetEnd <= other. targetEnd + mismatchTailTarget ()
10001005
)
10011006
return true;
1002-
if ( ! pseudo ()
1007+
if ( ! partialPseudo ()
10031008
|| isMutationProt ()
10041009
|| other. isMutationProt ()
10051010
|| fusion2geneSymbols () != other. fusion2geneSymbols ()
@@ -1102,6 +1107,7 @@ struct BlastAlignment : Alignment
11021107
)
11031108
if ( ! targetProt
11041109
|| ( ! isMutationProt () // PD-4722
1110+
&& ! other. isMutationProt () // PD-4755
11051111
&& fusion2geneSymbols () != other. fusion2geneSymbols ()
11061112
)
11071113
) // PD-4687
@@ -1397,7 +1403,7 @@ struct Batch
13971403
}
13981404
catch (const exception &e)
13991405
{
1400-
throw runtime_error ("Cannot read " + famFName +", line " + toString (f. lineNum) + "\n" + e. what ());
1406+
throw runtime_error ("Cannot read " + famFName +", " + f. lineStr () + "\n" + e. what ());
14011407
}
14021408
}
14031409
{
@@ -2150,7 +2156,8 @@ struct ThisApplication : Application
21502156

21512157
// Output
21522158
addKey ("out", "Identifiers of the reported input proteins");
2153-
addFlag ("print_node", "Print FAM.id");
2159+
addFlag ("print_node", "Print FAM.id replaced by FAM.parent for non-exact allele matches");
2160+
addFlag ("print_node_raw", "Print FAM.id");
21542161
addFlag ("pseudo", "Indicate pseudo-genes as method INTERNAL_STOP"); // or FRAME_SHIFT
21552162
addFlag ("force_cds_report", "Report contig/start/stop/strand even if this information does not exist");
21562163
addFlag ("non_reportable", "Report non-reportable families");
@@ -2195,6 +2202,7 @@ struct ThisApplication : Application
21952202
equidistant = getFlag ("report_equidistant");
21962203
const string outFName = getArg ("out");
21972204
print_node = getFlag ("print_node");
2205+
print_node_raw = getFlag ("print_node_raw");
21982206
reportPseudo = getFlag ("pseudo");
21992207
const bool force_cds_report = getFlag ("force_cds_report");
22002208
const bool non_reportable = getFlag ("non_reportable");
@@ -2221,6 +2229,8 @@ struct ThisApplication : Application
22212229
QC_ASSERT (! lcl);
22222230
QC_ASSERT (! bedP);
22232231
}
2232+
2233+
QC_IMPLY (print_node_raw, print_node);
22242234

22252235

22262236
if (ident_min == -1.0)

amrfinder.cpp

Lines changed: 64 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,15 @@
3333
* gunzip (optional)
3434
*
3535
* Release changes:
36+
* 3.11.26 10/16/2023 PD-4772 Remove prodigal GFF format from AMRFinderPlus
37+
* 3.11.25 10/13/2023 PD-4771 Revert removing '*' from Prodigal output to ensure ALLELEP and EXACTP matches ??
38+
* 3.11.24 10/12/2023 PD-4769 --print_node prints FAM.id replaced by FAM.parent for non-exact allele matches
39+
* 3.11.23 10/06/2023 PD-4764 Remove '*' from Prodigal output to ensure ALLELEP and EXACTP matches
40+
* 10/05/2023 PD-4761 Remove protein sequences with >= 20 Xs
41+
* 3.11.22 10/05/2023 PD-4754 Prodigal GFF
42+
* 3.11.21 10/02/2023 PD-4755 bug: calling fusion2geneSymbols() for a mutation protein
3643
* 3.11.20 09/06/2023 PD-4722 bug: calling fusion2geneSymbols() for a mutation protein
37-
* TERM codes are printed only when output is to screen
44+
* color codes are printed only when output is to screen
3845
* 3.11.19 08/09/2023 PD-4698 if a pseudogene is overlapped by a different gene on the length >= 20 aa with the same gene symbol then the pseudogene is not reported
3946
* 08/04/2023 PD-4706 protein match overrides a nucleotide match for point mutations
4047
* 3.11.18 07/25/2023 parameter order in instruction; "can be gzipped" is added to help
@@ -281,6 +288,7 @@ constexpr size_t threads_def = 4;
281288
// Cf. amr_report.cpp
282289
constexpr double ident_min_def = 0.9;
283290
constexpr double partial_coverage_min_def = 0.5;
291+
const string ambigS ("20");
284292

285293

286294
#define HELP \
@@ -310,7 +318,10 @@ struct ThisApplication : ShellApplication
310318
addKey ("protein", "Input protein FASTA file (can be gzipped)", "", 'p', "PROT_FASTA");
311319
addKey ("nucleotide", "Input nucleotide FASTA file (can be gzipped)", "", 'n', "NUC_FASTA");
312320
addKey ("gff", "GFF file for protein locations (can be gzipped). Protein id should be in the attribute 'Name=<id>' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE");
313-
addKey ("annotation_format", "Type of GFF file: " + Gff::names. toString (", "), "genbank", 'a', "ANNOTATION_FORMAT");
321+
322+
string annots (Gff::names. toString (", "));
323+
replaceStr (annots, ", prodigal", ""); // PD-4772 ??
324+
addKey ("annotation_format", "Type of GFF file: " + annots, "genbank", 'a', "ANNOTATION_FORMAT");
314325

315326
addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR");
316327
addFlag ("database_version", "Print database version", 'V');
@@ -359,31 +370,25 @@ struct ThisApplication : ShellApplication
359370

360371

361372

362-
bool fastaCheck (const string &fName,
373+
void fastaCheck (const string &fName,
363374
bool prot,
364375
const string &qcS,
365376
const string &logFName,
366377
size_t &nSeq,
367378
size_t &len_max,
368-
size_t &len_total) const
369-
// Input: fName: quoted
370-
// Return: false <=> hyphen in protein FASTA
379+
size_t &len_total,
380+
const string &outFName) const
381+
// Input: fName, outFName: quoted
371382
{
372-
try
383+
ASSERT (fName != logFName);
384+
if (! outFName. empty ())
373385
{
374-
exec (fullProg ("fasta_check") + fName + " " + (prot ? "-aa" : "-len "+ tmp + "/len") + (prot ? "" : " -hyphen") + qcS + " -log " + logFName + " > " + tmp + "/nseq", logFName);
375-
}
376-
catch (...)
377-
{
378-
if (prot)
379-
{
380-
LineInput f (logFName);
381-
while (f. nextLine ())
382-
if (contains (f. line, "hyphen in the sequence")) // Cf. fasta_check.cpp
383-
return false;
384-
}
385-
throw;
386-
}
386+
ASSERT (outFName != fName);
387+
ASSERT (outFName != logFName);
388+
}
389+
exec (fullProg ("fasta_check") + fName + " " + (prot ? "-aa -stop_codon -ambig_max " + ambigS + prependS (outFName, " -out ") : "-len " + tmp + "/len -hyphen -ambig") + qcS + " -log " + logFName + " > " + tmp + "/nseq", logFName);
390+
// "-stop_codon" PD-4771 ??
391+
387392
const StringVector vec (tmp + "/nseq", (size_t) 10, true);
388393
if (vec. size () != 3)
389394
throw runtime_error (string (prot ? "Protein" : "DNA") + " fasta_check failed: " + vec. toString ("\n"));
@@ -393,7 +398,6 @@ struct ThisApplication : ShellApplication
393398
QC_ASSERT (nSeq);
394399
QC_ASSERT (len_max);
395400
QC_ASSERT (len_total);
396-
return true;
397401
}
398402

399403

@@ -942,26 +946,45 @@ struct ThisApplication : ShellApplication
942946
size_t nProt = 0;
943947
size_t protLen_max = 0;
944948
size_t protLen_total = 0;
945-
if (! fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total))
949+
950+
try
946951
{
952+
fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total, noString);
953+
}
954+
catch (...)
955+
{
956+
bool fixable = false;
957+
{
958+
LineInput f (logFName);
959+
while (f. nextLine ())
960+
// Cf. fasta_check.cpp
961+
if (contains (f. line, "Hyphen in the sequence"))
962+
{
963+
const Warning warning (stderr);
964+
stderr << "Ignoring dash '-' characters in the sequences of the protein file " << prot;
965+
fixable = true;
966+
break;
967+
}
968+
else if (contains (f. line, "Too many ambiguities"))
969+
{
970+
const Warning warning (stderr);
971+
stderr << "Removing sequences with >= " << ambigS << " Xs from the protein file " << prot;
972+
fixable = true;
973+
break;
974+
}
975+
else if (contains (f. line, "'*' at the sequence end"))
976+
{
977+
const Warning warning (stderr);
978+
stderr << "Removing '*' from the ends of protein sequences in " << prot;
979+
fixable = true;
980+
break;
981+
}
982+
}
983+
if (! fixable)
984+
throw;
947985
prot1 = shellQuote (tmp + "/prot");
948-
OFStream outF (unQuote (prot1));
949-
LineInput f (unQuote (prot_flat));
950-
while (f. nextLine ())
951-
{
952-
trimTrailing (f. line);
953-
if (f. line. empty ())
954-
continue;
955-
if (f. line [0] != '>')
956-
replaceStr (f. line, "-", "");
957-
outF << f. line << endl;
958-
}
959-
{
960-
const Warning warning (stderr);
961-
stderr << "Ignoring dash '-' characters in the sequences of the protein file " << prot;
962-
}
963-
EXEC_ASSERT (fastaCheck (prot1, true, qcS, logFName, nProt, protLen_max, protLen_total));
964-
}
986+
fastaCheck (prot_flat, true, qcS, logFName, nProt, protLen_max, protLen_total, prot1);
987+
}
965988

966989
// gff_check
967990
if (! emptyArg (gff) && ! contains (parm, "-bed"))
@@ -984,6 +1007,7 @@ struct ThisApplication : ShellApplication
9841007
}
9851008
break;
9861009
case Gff::microscope: gffProtMatchP = true; break;
1010+
case Gff::prodigal: gffProtMatchP = true; break;
9871011
default: break;
9881012
}
9891013
if (gffProtMatchP)
@@ -1067,7 +1091,7 @@ struct ThisApplication : ShellApplication
10671091
size_t nDna = 0;
10681092
size_t dnaLen_max = 0;
10691093
size_t dnaLen_total = 0;
1070-
EXEC_ASSERT (fastaCheck (dna_flat, false, qcS, logFName, nDna, dnaLen_max, dnaLen_total));
1094+
/*EXEC_ASSERT (*/ fastaCheck (dna_flat, false, qcS, logFName, nDna, dnaLen_max, dnaLen_total, noString); // );
10711095
const string blastx (/*"tblastn"*/ dnaLen_max > 100000 ? "tblastn" : "blastx"); // PAR // SB-3643
10721096

10731097
stderr. section ("Running " + blastx);

0 commit comments

Comments
 (0)