muellan
diff --git a/‎Makefile‎
Lines changed: 16 additions & 11 deletions b/‎Makefile‎
Lines changed: 16 additions & 11 deletions
diff --git a/‎README.md‎
Lines changed: 56 additions & 29 deletions b/‎README.md‎
Lines changed: 56 additions & 29 deletions
diff --git a/‎dep/hpc_helpers/LICENSE‎
Lines changed: 1 addition & 1 deletion b/‎dep/hpc_helpers/LICENSE‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docs/gpu_version.md‎
Lines changed: 62 additions & 30 deletions b/‎docs/gpu_version.md‎
Lines changed: 62 additions & 30 deletions
diff --git a/‎docs/mode_build.txt‎
Lines changed: 3 additions & 0 deletions b/‎docs/mode_build.txt‎
Lines changed: 3 additions & 0 deletions
@@ -12,18 +12,22 @@ DIALECT       = -std=c++14
 WARNINGS      = -Wall -Wextra -Wpedantic
 NVCC_WARNINGS = -Xcompiler="-Wall -Wextra"
 OPTIMIZATION  = -O3
-INCLUDES      = -lz
-#-march native -fomit-frame-pointer
-# CUB = -I<path-to-cub>
-NVCC_FLAGS    = $(CUB) -arch=sm_70 -lineinfo --expt-relaxed-constexpr --extended-lambda
+INCLUDE       = 
 
-CXXFLAGS      = $(INCLUDES) $(MACROS) $(DIALECT) $(WARNINGS)
+NVCC_FLAGS    = $(CUB) -arch=$(CUDA_ARCH) -lineinfo --expt-relaxed-constexpr --extended-lambda
+CXXFLAGS      = $(INCLUDE) $(MACROS) $(DIALECT) $(WARNINGS)
 
-CUDA_FLAGS    = $(NVCC_FLAGS) $(INCLUDES) $(MACROS) $(DIALECT) $(NVCC_WARNINGS)
+LDFLAGS       = -pthread
 
-LDFLAGS       = -pthread $(INCLUDES)
+CUDA_FLAGS    = $(NVCC_FLAGS) $(INCLUDE) $(MACROS) $(DIALECT) $(NVCC_WARNINGS)
+CUDA_LDFLAGS  = $(NVCC_FLAGS) -Xcompiler="-pthread"
 
-CUDA_LDFLAGS  = $(NVCC_FLAGS) $(INCLUDES) -Xcompiler="-pthread"
+# zlib support is enabled by default: link against zlib (-lz).
+# if MC_ZLIB=NO => deactivate zlib support: define MC_NO_ZLIB and
+# do not link zlib, so the build has no zlib dependency.
+ifeq ($(MC_ZLIB),NO)
+MACROS += -DMC_NO_ZLIB
+else
+LDFLAGS += -lz
+CUDA_LDFLAGS += -lz
+endif
 
 
 #--------------------------------------------------------------------
@@ -125,8 +129,6 @@ CUDA_COMPILE      = $(CUDA_COMPILER) $(CUDA_FLAGS) -c $< -o $@
 #--------------------------------------------------------------------
 # main targets
 #--------------------------------------------------------------------
-.PHONY: all clean
-
 # Build the release artifact through a recursive make call.
 # MACROS is quoted so that a value holding several flags reaches the
 # sub-make as a single variable assignment (same as in gpu_release).
 release:
 	$(MAKE) release_dummy DIR=$(REL_DIR) ARTIFACT=$(REL_ARTIFACT) MACROS="$(MACROS)"
 
@@ -146,7 +148,6 @@ release_dummy: $(REL_DIR) $(REL_ARTIFACT)
 debug_dummy:   $(DBG_DIR) $(DBG_ARTIFACT)
 profile_dummy: $(PRF_DIR) $(PRF_ARTIFACT)
 
-
 gpu_release:
 	$(MAKE) gpu_release_dummy DIR=$(REL_CUDA_DIR) CUDA_ARTIFACT=$(REL_CUDA_ARTIFACT) MACROS="$(MACROS) -DGPU_MODE"
 
@@ -173,7 +174,11 @@ gpu_debug_dummy:   $(DBG_CUDA_DIR) $(DBG_CUDA_ARTIFACT)
 gpu_profile_dummy: $(PRF_CUDA_DIR) $(PRF_CUDA_ARTIFACT)
 
 
+# phony targets: these names are commands, not files, so a same-named
+# file in the source directory must never make them look up to date
+.PHONY: all clean gpu cpu release debug profile gpu_release
 all: release debug profile
+cpu: release
+gpu: gpu_release
 
 clean :
 	rm -rf build_*
 
@@ -6,22 +6,29 @@ MetaCache is a classification system for mapping genomic sequences (short reads,
 
 For an independent comparison to other tools in terms of classification accuracy see the [LEMMI](https://lemmi.ezlab.org) benchmarking site.
 
-MetaCache's CPU version classifies around 60 Million reads (of length 100) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 97 running with 88 threads on a workstation with 2 Intel(R) Xeon(R) Gold 6238 CPUs.
+**MetaCache's CPU** version classifies around 60 Million reads (of length 100) per minute against all complete bacterial, viral and archaea genomes from NCBI RefSeq Release 97 running with 88 threads on a workstation with 2 Intel(R) Xeon(R) Gold 6238 CPUs.
 
-MetaCache's [GPU version](docs/gpu_version.md) classifies around 300 Million reads (of length 100) per minute against all complete bacterial, viral, fungal and archaea genomes from NCBI RefSeq Release 202 running on a workstation with 4 NVIDIA(R) Tesla(R) V100 GPUs (32 GB model).
+**MetaCache's [GPU version](docs/gpu_version.md)** classifies around 300 Million reads (of length 100) per minute against all complete bacterial, viral, fungal and archaea genomes from NCBI RefSeq Release 202 running on a workstation with 4 NVIDIA(R) Tesla(R) V100 GPUs (32 GB model).
 
 
 
 
 ## Quick Start with NCBI RefSeq
-This will download MetaCache, compile it, download the complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release (this can take some time) and build a classification database from them:
+on a Debian/Ubuntu system:
 
 ```
+sudo apt install -y zlib1g zlib1g-dev
 git clone https://github.com/muellan/metacache.git
 cd metacache
 make
 ./metacache-build-refseq
 ```
+This will
+  * install the zlib library  
+  * download the MetaCache source code from GitHub
+  * compile MetaCache (without GPU support)
+  * download the complete bacterial, viral and archaea genomes from the latest NCBI RefSeq release (this can take some time) 
+  * build a classification database
 
 Once the default database is built you can classify reads:
   ```
@@ -36,56 +43,69 @@ Once the default database is built you can classify reads:
 
 ## Detailed Installation Instructions
 
-#### Requirements
-MetaCache itself should compile on any platform for which a C++14 conforming compiler is available. The Makefile is written with g++ or clang++ in mind, but could probably be adapted to MSVC or other compilers.
+Visit MetaCache's github [repository] to get the latest resources.
+
+* To compile the CPU version: run `make` in the directory containing the Makefile
+* To compile the GPU version, follow the instructions provided [here](docs/gpu_version.md).
+
+
+### CPU Version Requirements
+
+MetaCache itself should compile on any platform for which a C++14 conforming compiler is available. The Makefile is written with g++ or clang++ in mind, but could probably be adapted to (a very recent version of) MSVC or other compilers.
 
 The helper scripts (for downloading genomes, taxonomy etc.) require the Bash shell to run. That means you need a working bash executable as well as some common GNU utilities like "awk" and "wget". On Windows you should use the 'Windows Subsystem for Linux' (which gives you an Ubuntu user mode talking to the Windows Kernel).
 
-There are no dependencies on third party libraries.
-MetaCache was successfully tested on the following platforms (all 64 bit + 64 bit compilers):
-- Ubuntu 14.04 with g++ 5.4
-- Ubuntu 16.04 with g++ 5.3, g++ 7.2
-- Ubuntu 18.04 with g++ 5.4, g++ 7.4
-- Windows 10 Build 1709 64bit with MinGW-w64 g++ 7.2
-- Windows 10 Build 1909 64bit running Ubuntu 16.04 inside WSL and g++ 7.2
+MetaCache 2.0.0 was successfully tested on the following platforms (all 64 bit + 64 bit compilers):
+- Ubuntu 20.04 with g++ 5.4, g++ 7.4
+- Windows 10 20H2 running Ubuntu 20.04 inside WSL2 and g++ 10.3
 
 In order to be able to build the default database (based on NCBI RefSeq Release 97) with default settings your system should have around 64GB of RAM (note that the NCBI RefSeq will still be growing in the near future).
 If you don't have enough RAM, you can use [database partitioning](docs/partitioning.md).
 
-#### Get The Latest Sources
-Visit MetaCache's github [repository].
 
+### GPU Version Requirements
+The GPU version requires a CUDA-capable device of the Pascal generation or newer and either CUDA >= 11 or CUDA 10.2 and a self-provided version of [CUB](https://github.com/NVlabs/cub).
 
-#### Compile
-Run 'make' in the directory containing the Makefile.
-This will compile MetaCache with the default data type settings which support databases with up to 65,535 reference sequences (targets) and k-mer sizes up to 16. This offers a good database space efficiency and is currently sufficient for the complete bacterial, viral and archaea genomes from the NCBI RefSeq.
+See [here](docs/gpu_version.md) for more.
 
-If you want MetaCache to be able to process gzipped files make sure you have the zlib library installed on your system and compile with:
 
+### Library Requirements (CPU & GPU versions)
+MetaCache requires the zlib compression library to be installed on your system in order to be able to process gzipped FASTA/FASTQ files.
+On Debian/Ubuntu zlib can be installed with
   ```
-  make MACROS="-DMC_ZLIB"
+  sudo apt install -y zlib1g zlib1g-dev
   ```
+If you *don't* have zlib installed or cannot do so you can compile with:
+  ```
+  make MC_ZLIB=NO
+  ```
+which will remove the zlib dependency and disable support for gzipped input files.
+
 
-Using the following compilation options you can compile MetaCache with support for more reference sequences and greater k-mer lengths.
+### Custom Configurations
 
-##### number of referece sequences (targets)
+If you run 'make' without additional parameters MetaCache will be compiled with the default data type settings which support databases with up to 65,535 reference sequences (targets) and k-mer sizes up to 16. This offers a good database space efficiency and is currently sufficient for the complete bacterial, viral and archaea genomes from the NCBI RefSeq.
 
-* support for up to 65,535 reference sequences (default):
+Using the following compilation options you can compile MetaCache with support for more targets and greater k-mer lengths.
+
+#### number of reference sequences (targets)
+
+* support for up to 65,535 targets (default):
   ```
   make MACROS="-DMC_TARGET_ID_TYPE=uint16_t"
   ```
 
-* support for up to 4,294,967,295 reference sequences (needs more memory):
+* support for up to 4,294,967,295 targets (needs more memory):
   ```
   make MACROS="-DMC_TARGET_ID_TYPE=uint32_t"
   ```
 
-* support for more than 4,294,967,295 reference sequences (needs even more memory)
+* support for more than 4,294,967,295 targets (needs even more memory)
   ```
   make MACROS="-DMC_TARGET_ID_TYPE=uint64_t"
   ```
 
-##### reference sequence lenghts
+#### reference sequence lengths
 * support for targets up to a length of 4,294,967,295 windows (default)
   with default settings (window length, k-mer size) no sequence length must exceed 485.3 billion nucleotides
   ```
@@ -98,8 +118,7 @@ Using the following compilation options you can compile MetaCache with support f
   make MACROS="-DMC_WINDOW_ID_TYPE=uint16_t"
   ```
 
-
-##### kmer lengths
+#### kmer lengths
 * support for kmer lengths up to 16 (default):
   ```
   make MACROS="-DMC_KMER_TYPE=uint32_t"
@@ -112,14 +131,21 @@ Using the following compilation options you can compile MetaCache with support f
 
 You can of course combine these options (don't forget the surrounding quotes):
   ```
-  make MACROS="-DMC_ZLIB -DMC_TARGET_ID_TYPE=uint32_t -DMC_WINDOW_ID_TYPE=uint32_t"
+  make MACROS="-DMC_TARGET_ID_TYPE=uint32_t -DMC_WINDOW_ID_TYPE=uint32_t"
   ```
 
 **Note that a database can only be queried with the same variant of MetaCache (regarding data type sizes) that it was built with.**
 
 In rare cases databases built on one platform might not work with MetaCache on other platforms due to bit-endianness and data type width differences. Especially mixing MetaCache executables compiled with 32-bit and 64-bit compilers might be problematic.
 
 
+#### disabling zlib support
+
+If you *don't* have the zlib compression library installed and/or *don't* want gzipped input file support you can compile with:
+  ```
+  make MC_ZLIB=NO
+  ```
+
 
 
 ## Building Databases
@@ -160,8 +186,9 @@ Once a database (e.g. the standard 'refseq'), is built you can classify reads.
 
 ## Documentation of Command Line Parameters
 
-* [for mode `build`](docs/mode_build.txt): build database from reference genomes
+* [for mode `build`](docs/mode_build.txt): build database from reference genomes (and write it to disk)
 * [for mode `query`](docs/mode_query.txt): query reads against database
+* [for mode `build+query`](docs/mode_build_query.txt): build reference database and immediately query reads (mainly recommended for GPU version)
 * [for mode `merge`](docs/mode_merge.txt): merge results of independent queries
 * [for mode `modify`](docs/mode_modify.txt): add reference genomes to database or update taxonomy
 * [for mode `info`](docs/mode_info.txt): obtain information about a database
 
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2020 Parallel and Distributed Architectures
+Copyright (c) 2021 Parallel and Distributed Architectures
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 
@@ -1,69 +1,101 @@
+
 # MetaCache-GPU
 
-## Installation Instructions
 
-#### Requirements
+## Example Installation 
+on an Ubuntu system with NVIDIA Quadro GV100 GPUs and CUDA SDK version 11 installed:
+```
+sudo apt install -y zlib1g zlib1g-dev
+git clone https://github.com/muellan/metacache.git
+cd metacache
+git submodule update --init --recursive
+make gpu CUDA_ARCH=sm_70
+```
+See below for more details.
 
-The GPU version of MetaCache requires a CUDA-capable device of the Pascal generation or newer and either:
 
-* CUDA >= 11
-* CUDA 10.2 and a self-provided version of [CUB](https://github.com/NVlabs/cub)
 
-Make sure to adjust the Makefile to the GPU generation you want to use by setting the `-arch` flag (e.g. `-arch=sm_70` for Quadro GV100). You also have to set the include path for CUB if your CUDA version is below CUDA 11.
+## Requirements
 
-MetaCache-GPU depends on the hashtable implementation of [warpcore](https://github.com/sleeepyjack/warpcore) and the sorting algorithm [bb_segsort](https://github.com/Funatiq/bb_segsort). Both repositories are included as submodules and need to be checked out in addition to MetaCache itself. You can do so be calling
+### Hardware Requirements
 
-```git submodule update --init --recursive```
+The GPU version of MetaCache requires a CUDA-capable device of the Pascal generation or newer.
 
-In order to be able to build the default database (based on NCBI RefSeq Release 97) with default settings your system will need a total of 120 GB of GPU memory (e.g. 4x GPUs with 32 GB each).
+In order to be able to build the default database (based on NCBI RefSeq Release 97) with default settings your system will need a total of 120 GB of  GPU memory (e.g. 4x GPUs with 32 GB each).
 If you don't have enough GPU memory, you can use [database partitioning](docs/partitioning.md).
 
-#### Compile
-Run '`make gpu_release`' in the directory containing the Makefile.
-This will compile MetaCache-GPU with support for:
 
-* up to 4,294,967,295 reference sequences
-* targets up to a length of 4,294,967,295 windows
-* kmer lengths up to 16
+### Software Dependencies
+
+* CUDA SDK
+  * CUDA >= 11
+  * CUDA 10.2 and a self-provided version of [CUB](https://github.com/NVlabs/cub) (you also need to set the include path for CUB by supplying `INCLUDE=*your_cub_path*` when calling make)
+
+* Hashtable library [warpcore](https://github.com/sleeepyjack/warpcore) and sorting library [bb_segsort](https://github.com/Funatiq/bb_segsort). Both repositories are included as submodules and need to be checked out in addition to MetaCache itself. You can do so by calling
+  ```git submodule update --init --recursive```
+
+* Support for gzipped FASTA/FASTQ files requires the zlib compression library to be installed on your system.
+  On Debian/Ubuntu zlib can be installed with
+  `sudo apt install -y zlib1g zlib1g-dev`. If you *don't* have zlib installed or cannot do so you can compile with `make MC_ZLIB=NO`
+  which will remove the zlib dependency and disable support for gzipped input files.
+
+
+## Installation / Compiling
+
+Run `make` in the directory containing the Makefile and set the GPU generation with the `CUDA_ARCH` flag (e.g. `CUDA_ARCH=sm_70` for Quadro GV100):
+```
+make gpu CUDA_ARCH=sm_70
+```
+
+If you don't supply additional parameters MetaCache will be compiled with the default data type settings which support databases with
+
+* up to 4,294,967,295 targets (= reference sequences)
+* targets with a length of up to 4,294,967,295 windows (which corresponds to approximately 485.3 billion nucleotides with the default window size of 112)
+* kmers with lengths of up to 16
 
 This corresponds to the CPU version compiled with `make MACROS="-DMC_TARGET_ID_TYPE=uint32_t"`
 
-**Note that a database build by the GPU version can be queried by the corresponding CPU version and vice versa. The only restriction is the available (GPU) memory.**
+**A database built by the GPU version can be queried by the corresponding CPU version and vice versa. The only restriction is the available (GPU) memory.**
+
 
 
 ## Differences to CPU version
 
 MetaCache-GPU allows to **build** distributed databases across multiple GPUs.
-In difference to the [database partitioning](docs/partitioning.md) approach, the program distributes the reference genomes automatically across the GPUs in a single run. Due to the dynamic distribution scheme and the concurrent execution on the GPUs, two database builds for the same input files will most likely differ. However, this should have only a small impact on classification performance.
+In contrast to the [database partitioning](docs/partitioning.md) approach, the reference genomes are automatically distributed across multiple GPUs in a single run. Due to the dynamic distribution scheme and the concurrent execution on the GPUs, two database builds for the same input files will most likely differ. However, this should only have a negligible impact on classification performance.
+
+In order to **query** a multi-GPU database make sure to set the same number of GPUs when using the query mode. 
+
+### Build+Query Immediate Mode
+Since building databases is significantly faster on the GPU than on the CPU and will often take less than a minute, the [build+query mode](docs/mode_build_query.txt) can be used to build and directly query a database without writing the database to disk.
 
-In order to **query** a multi-GPU database make sure to set the same number of GPUs when using the query mode. Note, that only a small number of threads is needed to saturate the GPU query pipeline.
 
-#### Command Line Options
+### Command Line Options
 
 The command line options of the GPU version are similar to the CPU version with a few notable exceptions:
 
-##### mode build
+#### mode build
 
 * `-parts <#>` sets the number of GPUs to use (default: all available GPUs).
 
-##### mode query
+#### mode query
 
 * `-replicate <#>` enables multiple GPU pipelines (default: 1). Each pipeline occupies one GPU per database part.
 
-##### mode build & mode query
+#### mode build & mode query
 
 * `-kmerlen` kmer length is limited to 16 (default: 16).
 * `-sketchlen` sketch length is limited to 16 (default: 16).
 * `-winlen` window length is limited to 127 (default: 127).
-* `-winstride` window stride has to be multiple of 4 (default: 112).
-* `-remove-overpopulated-features` is not supported.
-* `-remove-ambig-features` is not supported.
+* `-winstride` window stride has to be a multiple of 4 (default: 112).
+* `-remove-overpopulated-features` is *not* supported.
+* `-remove-ambig-features` is *not* supported.
 
-##### mode info
+#### mode info
 
-* feature map is not available.
-* feature counts are not available.
+* submode `locations` is *not* available.
+* submode `featurecounts` is *not* available.
 
-##### mode merge
+#### mode merge
 
-* merging on GPU is not available and will fall back to CPU version.
+Merging multiple result files will *not* be performed on the GPU and will fall back to the CPU.
@@ -93,18 +93,21 @@ ADVANCED OPTIONS
                       family, suborder, order, subclass, class, subphylum,
                       phylum, subkingdom, kingdom, domain
                       default: off
+                      Not available in the GPU version.
 
     -max-ambig-per-feature <#>
                       Maximum number of allowed different reference sequence
                       taxa per feature if option '-remove-ambig-features' is
                       used.
+                      Not available in the GPU version.
 
     -max-load-fac <factor>
                       maximum hash table load factor;
                       This can be used to trade off larger memory consumption
                       for speed and vice versa. A lower load factor will improve
                       speed, a larger one will improve memory efficiency.
                       default: 0.800000
+                      Not available in the GPU version.
 
     -parts <#>        Splits the database into multiple parts. Each part
                       contains a separate hash table.