muellan
diff --git a/‎dev/make_docs.sh
Lines changed: 7 additions & 6 deletions b/‎dev/make_docs.sh
Lines changed: 7 additions & 6 deletions
diff --git a/‎docs/mode_build.txt
Lines changed: 0 additions & 3 deletions b/‎docs/mode_build.txt
Lines changed: 0 additions & 3 deletions
diff --git a/‎docs/mode_build_query.txt
Lines changed: 367 additions & 0 deletions b/‎docs/mode_build_query.txt
Lines changed: 367 additions & 0 deletions
@@ -1,9 +1,10 @@
 #!/bin/bash
 
-./metacache help build  > docs/mode_build.txt
-./metacache help modify > docs/mode_modify.txt
-./metacache help query  > docs/mode_query.txt
-./metacache help merge  > docs/mode_merge.txt
-./metacache help info   > docs/mode_info.txt
-./metacache help help   > docs/mode_help.txt
+./metacache help build       > docs/mode_build.txt
+./metacache help modify      > docs/mode_modify.txt
+./metacache help query       > docs/mode_query.txt
+./metacache help build+query > docs/mode_build_query.txt
+./metacache help merge       > docs/mode_merge.txt
+./metacache help info        > docs/mode_info.txt
+./metacache help help        > docs/mode_help.txt
 
@@ -110,9 +110,6 @@ ADVANCED OPTIONS
                       contains a separate hash table.
                       default: 1
 
-    -query            Run interactive query after building database.
-                      default: off
-
 EXAMPLES
 
     Build database 'mydb' from sequence file 'genomes.fna':
 
@@ -0,0 +1,367 @@
+SYNOPSIS
+
+    metacache build+query -targets <sequence file/directory>... [OPTION]...
+
+    metacache build+query [OPTION]... -targets <sequence file/directory>...
+
+    metacache build+query -targets <sequence file/directory>... -query <sequence file/directory>... [OPTION]...
+
+    metacache build+query -targets <sequence file/directory>... [OPTION]... -query <sequence file/directory>...
+
+    metacache build+query [OPTION]... -targets <sequence file/directory>... -query <sequence file/directory>...
+
+
+DESCRIPTION
+
+    Create a new database of reference sequences (usually genomic sequences) and use it to map (other) sequences to their most likely taxon of origin.
+
+
+REQUIRED PARAMETERS
+
+    <sequence file/directory>...
+                      FASTA or FASTQ files containing genomic sequences
+                      (complete genomes, scaffolds, contigs, ...) that shall
+                      beused as representatives of an organism/taxon.
+                      If directory names are given, they will be searched for
+                      sequence files (at most 10 levels deep).
+
+
+
+BASIC OPTIONS
+
+    -taxonomy <path>  directory with taxonomic hierarchy data (see NCBI's
+                      taxonomic data files)
+
+    -taxpostmap <file>
+                      Files with sequence to taxon id mappings that are used as
+                      alternative source in a post processing step.
+                      default: 'nucl_(gb|wgs|est|gss).accession2taxid'
+
+    -silent|-verbose  information level during build:
+                      silent => none / verbose => most detailed
+                      default: neither => only errors/important info
+
+
+SKETCHING (SUBSAMPLING)
+
+    -kmerlen <k>      number of nucleotides/characters in a k-mer
+                      default: 16
+
+    -sketchlen <s>    number of features (k-mer hashes) per sampling window
+                      default: 16
+
+    -winlen <w>       number of letters in each sampling window
+                      default: 127
+
+    -winstride <l>    distance between window starting positions
+                      default: 112 (w-k+1)
+
+
+ADVANCED OPTIONS
+
+    -reset-taxa       Attempts to re-rank all sequences after the main build
+                      phase using '.accession2taxid' files. This will reset the
+                      taxon id of a reference sequence even if a taxon id could
+                      be obtained from other sources during the build phase.
+                      default: off
+
+    -max-locations-per-feature <#>
+                      maximum number of reference sequence locations to be
+                      stored per feature;
+                      If the value is too high it will significantly impact
+                      querying speed. Note that an upper hard limit is always
+                      imposed by the data type used for the hash table bucket
+                      size (set with compilation macro
+                      '-DMC_LOCATION_LIST_SIZE_TYPE').
+                      default: 254
+
+    -remove-overpopulated-features
+                      Removes all features that have reached the maximum allowed
+                      amount of locations per feature. This can improve querying
+                      speed and can be used to remove non-discriminative
+                      features.
+                      default: off
+
+    -remove-ambig-features <rank>
+                      Removes all features that have more distinct reference
+                      sequence on the given taxonomic rank than set by
+                      '-max-ambig-per-feature'. This can decrease the database
+                      size significantly at the expense of sensitivity. Note
+                      that the lower the given taxonomic rank is, the more
+                      pronounced the effect will be.
+                      Valid values: sequence, form, variety, subspecies,
+                      species, subgenus, genus, subtribe, tribe, subfamily,
+                      family, suborder, order, subclass, class, subphylum,
+                      phylum, subkingdom, kingdom, domain
+                      default: off
+
+    -max-ambig-per-feature <#>
+                      Maximum number of allowed different reference sequence
+                      taxa per feature if option '-remove-ambig-features' is
+                      used.
+
+    -max-load-fac <factor>
+                      maximum hash table load factor;
+                      This can be used to trade off larger memory consumption
+                      for speed and vice versa. A lower load factor will improve
+                      speed, a larger one will improve memory efficiency.
+                      default: 0.800000
+
+    -parts <#>        Splits the database into multiple parts. Each part
+                      contains a separate hash table.
+                      default: 1
+
+    -save-db <database filename>
+                      Save database to disk after querying.
+
+
+QUERY PARAMETERS
+
+    <sequence file/directory>...
+                      FASTA or FASTQ files containing genomic sequences (short
+                      reads, long reads, contigs, complete genomes, ...) that
+                      shall be classified.
+                      * If directory names are given, they will be searched for
+                      sequence files (at most 10 levels deep).
+                      * If no input filenames or directories are given,
+                      MetaCache will run in interactive query mode. This can be
+                      used to load the database into memory only once and then
+                      query it multiple times with different query options.
+
+
+MAPPING RESULTS OUTPUT
+
+    -out <file>       Redirect output to file <file>.
+                      If not specified, output will be written to stdout. If
+                      more than one input file was given all output will be
+                      concatenated into one file.
+
+
+    -split-out <file> Generate output and statistics for each input file
+                      separately. For each input file <in> an output file with
+                      name <file>_<in> will be written.
+
+
+PAIRED-END READ HANDLING
+
+    -pairfiles        Interleave paired-end reads from two consecutive files, so
+                      that the nth read from file m and the nth read from file
+                      m+1 will be treated as a pair. If more than two files are
+                      provided, their names will be sorted before processing.
+                      Thus, the order defined by the filenames determines the
+                      pairing not the order in which they were given in the
+                      command line.
+
+
+    -pairseq          Two consecutive sequences (1+2, 3+4, ...) from each file
+                      will be treated as paired-end reads.
+
+
+    -insertsize <#>   Maximum insert size to consider.
+                      default: sum of lengths of the individual reads
+
+
+CLASSIFICATION
+
+    -lowest <rank>    Do not classify on ranks below <rank>
+                      (Valid values: sequence, form, variety, subspecies,
+                      species, subgenus, genus, subtribe, tribe, subfamily,
+                      family, suborder, order, subclass, class, subphylum,
+                      phylum, subkingdom, kingdom, domain)
+                      default: sequence
+
+    -highest <rank>   Do not classify on ranks above <rank>
+                      (Valid values: sequence, form, variety, subspecies,
+                      species, subgenus, genus, subtribe, tribe, subfamily,
+                      family, suborder, order, subclass, class, subphylum,
+                      phylum, subkingdom, kingdom, domain)
+                      default: domain
+
+    -hitmin <t>       Sets classification threshhold to <t>.
+                      A read will not be classified if less than t features from
+                      the database match. Higher values will increase precision
+                      at the expense of sensitivity.
+                      default: 0
+
+    -hitdiff <t>      Sets classification threshhold to <t>.
+                      A read will not be classified if less than t features from
+                      the database match. Higher values will increase precision
+                      at the expense of sensitivity.
+                      default: 0
+
+    -maxcand <#>      maximum number of reference taxon candidates to consider
+                      for each query;
+                      A large value can significantly decrease the querying
+                      speed!.
+                      default: 2
+
+    -cov-percentile <p>
+                      Remove the p-th percentile of hit reference sequences with
+                      the lowest coverage. Classification is done using only the
+                      remaining reference sequences. This can help to reduce
+                      false positives, especially whenyour input data has a high
+                      sequencing coverage.
+                      This feature decreases the querying speed!
+                      default: off
+
+
+GENERAL OUTPUT FORMATTING
+
+    -no-summary       Dont't show result summary & mapping statistics at the end
+                      of the mapping output
+                      default: off
+
+    -no-query-params  Don't show query settings at the beginning of the mapping
+                      output
+                      default: off
+
+    -no-err           Suppress all error messages.
+                      default: off
+
+
+CLASSIFICATION RESULT FORMATTING
+
+    -no-map           Don't report classification for each individual query
+                      sequence; show summaries only (useful for quick tests).
+                      default: off
+
+    -mapped-only      Don't list unclassified reads/read pairs.
+                      default: off
+
+    -taxids           Print taxon ids in addition to taxon names.
+                      default: off
+
+    -taxids-only      Print taxon ids instead of taxon names.
+                      default: off
+
+    -omit-ranks       Do not print taxon rank names.
+                      default: off
+
+    -separate-cols    Prints *all* mapping information (rank, taxon name, taxon
+                      ids) in separate columns (see option '-separator').
+                      default: off
+
+    -separator <text> Sets string that separates output columns.
+                      default: '\t|\t'
+
+    -comment <text>   Sets string that precedes comment (non-mapping) lines.
+                      default: '# '
+
+    -queryids         Show a unique id for each query.
+                      Note that in paired-end mode a query is a pair of two read
+                      sequences. This option will always be activated if option
+                      '-hits-per-ref' is given.
+                      default: off
+
+    -lineage          Report complete lineage for per-read classification
+                      starting with the lowest rank found/allowed and ending
+                      with the highest rank allowed. See also options '-lowest'
+                      and '-highest'.
+                      default: off
+
+
+ANALYSIS: ABUNDANCES
+
+    -abundances <file>
+                      Show absolute and relative abundance of each taxon.
+                      If a valid filename is given, the list will be written to
+                      this file.
+                      default: off
+
+    -abundance-per <rank>
+                      Show absolute and relative abundances for each taxon on
+                      one specific rank.
+                      Classifications on higher ranks will be estimated by
+                      distributing them down according to the relative
+                      abundances of classifications on or below the given rank.
+                      (Valid values: sequence, form, variety, subspecies,
+                      species, subgenus, genus, subtribe, tribe, subfamily,
+                      family, suborder, order, subclass, class, subphylum,
+                      phylum, subkingdom, kingdom, domain)
+                      If '-abundances <file>' was given, this list will be
+                      printed to the same file.
+                      default: off
+
+
+ANALYSIS: RAW DATABASE HITS
+
+    -tophits          For each query, print top feature hits in database.
+                      default: off
+
+    -allhits          For each query, print all feature hits in database.
+                      default: off
+
+    -locations        Show locations in candidate reference sequences.
+                      Activates option '-tophits'.
+                      default: off
+
+    -hits-per-ref <file>
+                      Shows a list of all hits for each reference sequence.
+                      If this condensed list is all you need, you should
+                      deactive the per-read mapping output with '-no-map'.
+                      If a valid filename is given after '-hits-per-ref', the
+                      list will be written to a separate file.
+                      Option '-queryids' will be activated and the lowest
+                      classification rank will be set to 'sequence'.
+                      default: off
+
+
+ANALYSIS: ALIGNMENTS
+
+    -align            Show semi-global alignment to best candidate reference
+                      sequence.
+                      Original files of reference sequences must be available.
+                      This feature decreases the querying speed!
+                      default: off
+
+
+ADVANCED: GROUND TRUTH BASED EVALUATION
+
+    -ground-truth     Report correct query taxa if known.
+                      Queries need to have either a 'taxid|<number>' entry in
+                      their header or a sequence id that is also present in the
+                      database.
+                      This feature decreases the querying speed!
+                      default: off
+
+    -precision        Report precision & sensitivity by comparing query taxa
+                      (ground truth) and mapped taxa.
+                      Queries need to have either a 'taxid|<number>' entry in
+                      their header or a sequence id that is also found in the
+                      database.
+                      This feature decreases the querying speed!
+                      default: off
+
+    -taxon-coverage   Report true/false positives and true/false negatives.This
+                      option turns on '-precision', so ground truth data needs
+                      to be available.
+                      This feature decreases the querying speed!
+                      default: off
+
+
+ADVANCED: PERFORMANCE TUNING / TESTING
+
+    -threads <#>      Sets the maximum number of parallel threads to use.default
+                      (on this machine): 88
+
+
+    -batch-size <#>   Process <#> many queries (reads or read pairs) per thread
+                      at once.
+                      default (on this machine): 4096
+
+    -query-limit <#>  Classify at max. <#> queries (reads or read pairs) per
+                      input file.
+                      default: 9223372036854775807
+
+EXAMPLES
+
+    Build database from sequence file 'genomes.fna' and query all sequences in 'myreads.fna':
+        metacache build+query -targets genomes.fna -query myreads.fna
+
+    Build database with latest complete genomes from the NCBI RefSeq and query interactively
+        download-ncbi-genomes refseq/bacteria myfolder
+        download-ncbi-genomes refseq/viruses myfolder
+        download-ncbi-taxonomy myfolder
+        metacache build+query -targets myfolder -taxonomy myfolder
+
+