Skip to content

Commit

Permalink
Revise and optimize index creation
Browse files Browse the repository at this point in the history
  • Loading branch information
yp committed Jun 17, 2020
1 parent e260fe2 commit e71f01c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 60 deletions.
74 changes: 19 additions & 55 deletions bloomfilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,10 @@
#define _BLOOM_FILTER_HPP

#include <algorithm>
#include <array>
#include <sdsl/bit_vectors.hpp>
#include <sdsl/int_vector.hpp>
#include <sdsl/util.hpp>
#include <string>

#include <sys/mman.h>

#include "kmer_utils.hpp"

using namespace std;
Expand All @@ -39,10 +35,6 @@ using namespace sdsl;
class KmerBuilder;
class BloomfilterFiller;

#ifndef SHARK_HUGEPAGESIZE
#define SHARK_HUGEPAGESIZE (2 * 1024 * 1024)
#endif

class BF {
friend class KmerBuilder;
friend class BloomfilterFiller;
Expand All @@ -55,21 +47,14 @@ class BF {
typedef bit_vector_t::rank_1_type rank_t;
typedef vector<int> index_t;
typedef vector<index_t> set_index_t;
typedef int_vector<16> index_kmer_t;
typedef vector<uint16_t> index_kmer_t;
typedef bit_vector_t::select_1_type select_t;

BF(const size_t size) :
_size(size),
_mode(0),
_bf(size, 0)
{
#ifdef MADV_HUGEPAGE
char* const sptr = reinterpret_cast<char*>(_bf.data());
const size_t soffset = SHARK_HUGEPAGESIZE - (reinterpret_cast<size_t>(sptr) % SHARK_HUGEPAGESIZE);
char* const eptr = sptr + (((size + 63) >> 6) << 3);
const size_t eoffset = (reinterpret_cast<size_t>(eptr) % SHARK_HUGEPAGESIZE);
madvise(sptr + soffset, (eptr - sptr) - eoffset - soffset, MADV_HUGEPAGE);
#endif
}

~BF() {}
Expand All @@ -92,33 +77,20 @@ class BF {
return _bf[hash % _size];
}

// Function to add index to a k-mer
bool add_to_kmer(const uint64_t &kmer, const int &input_idx) {
void add_to_kmer(vector<uint64_t> &kmers, const int input_idx) {
if (_mode != 1)
return false;
return;

uint64_t hash = _get_hash(kmer);
size_t bf_idx = hash % _size;
if (_bf[bf_idx]) {
int kmer_rank = _brank(bf_idx);
_set_index[kmer_rank].push_back(input_idx);
return true;
for (auto& kmer: kmers) {
kmer = _get_hash(kmer) % _size;
}
return false;
}

// Function to add multiple indexes to a k-mer
bool multiple_add_to_kmer(const uint64_t &kmer, const vector<int> &idxs) {
// FIXME: can't we just use a single insert (range insertion)? Revert if it's wrong
if (_mode != 1)
return false;
uint64_t hash = _get_hash(kmer);
size_t bf_idx = hash % _size;
if (_bf[bf_idx]) {
sort(kmers.begin(), kmers.end());
for (const auto bf_idx: kmers) {
int kmer_rank = _brank(bf_idx);
_set_index[kmer_rank].insert(idxs.begin(), idxs.begin(), idxs.end());
const auto size = _set_index[kmer_rank].size();
if (size == 0 || _set_index[kmer_rank][size-1] != input_idx)
_set_index[kmer_rank].push_back(input_idx);
}
return true;
}

// Function that returns the indexes of a given k-mer
Expand All @@ -135,9 +107,7 @@ class BF {
size_t bf_idx = hash % _size;
if (_bf[bf_idx]) {
size_t rank_searched = _brank(bf_idx + 1);
if (rank_searched == 1) { // idxs of the first kmer
start_pos = 0;
} else {
if (rank_searched > 1) { // idxs of the first kmer
start_pos = _select_bv(rank_searched - 1) + 1;
}
end_pos = _select_bv(rank_searched);
Expand All @@ -147,7 +117,7 @@ class BF {
// it?
// See also comment in switch_mode regarding dummy index
}
return make_pair(_index_kmer.begin()+start_pos, _index_kmer.begin()+end_pos);
return make_pair(_index_kmer.cbegin()+start_pos, _index_kmer.cbegin()+end_pos);
}

/**
Expand All @@ -170,16 +140,14 @@ class BF {
util::init_support(_brank,&_bf);
size_t num_kmer = _brank(_bf.size());
if (num_kmer != 0)
_set_index.resize(num_kmer, vector<int>());
_set_index.resize(num_kmer, index_t());
return true;
} else if(_mode == 1 and new_mode == 2) {
_mode = new_mode;

// We compute how many idxs we have to store
int tot_idx = 0;
for (auto &set : _set_index) {
sort(set.begin(), set.end());
set.erase(unique(set.begin(), set.end()), set.end());
for (const auto &set : _set_index) {
tot_idx += set.size();
}

Expand All @@ -192,7 +160,7 @@ class BF {
**/
_bv = bit_vector(tot_idx, 0);
int pos = -1;
for (auto &set : _set_index) {
for (const auto &set : _set_index) {
pos += set.size();
_bv[pos] = 1;
}
Expand All @@ -204,20 +172,16 @@ class BF {
* associated to each kmer.
**/
//int_vector<16> tmp_index_kmer(tot_idx); // uncompressed and temporary
_index_kmer = int_vector<16>(tot_idx);
int idx_position = 0;
_index_kmer.resize(tot_idx);
index_kmer_t::iterator ins = _index_kmer.begin();
for (const auto &set : _set_index) {
// FIXME: should we check if the set is empty? Maybe saving a
// dummy index (0)? If so, we cannot use 0 as an index for
// kmers. Maybe -1 is better. Moreover, in this way, the main
// must manage this (it decides the idx). Anyway, I (LD) think
// this can never happen in our context.
// if ( set.size() != 0) {
for (const int &set_element : set) {
//tmp_index_kmer[idx_position] = set_element;
_index_kmer[idx_position] = set_element;
++idx_position;
}
ins = std::copy(set.begin(), set.end(), ins);
// } else { tmp_index_kmer[idx_position] = 0; idx_position++; }
}

Expand Down Expand Up @@ -247,7 +211,7 @@ class BF {
const BF &operator=(const BF &) = delete;
const BF &operator=(const BF &&) = delete;

size_t _size;
const size_t _size;
int _mode;
bit_vector_t _bf;
rank_t _brank;
Expand Down
10 changes: 5 additions & 5 deletions main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,11 @@
#include <algorithm>
#include <string>
#include <vector>
#include <unordered_set>

#include <zlib.h>

#include "kseq.h"
KSEQ_INIT(gzFile, gzread)
#include "sdsl/int_vector.hpp"
#include "sdsl/util.hpp"

#include "common.hpp"
#include "argument_parser.hpp"
Expand Down Expand Up @@ -132,7 +129,9 @@ int main(int argc, char *argv[]) {
int nidx = 0;
// open and read the .fa, every time a kmer is found the relative index is
// added to BF
vector<uint64_t> kmers;
while ((seq_len = kseq_read(seq)) >= 0) {
kmers.clear();
string input_name = seq->name.s;
legend_ID.push_back(input_name);

Expand All @@ -141,7 +140,7 @@ int main(int argc, char *argv[]) {
uint64_t kmer = build_kmer(seq->seq.s, _p, opt::k);
if(kmer == (uint64_t)-1) continue;
uint64_t rckmer = revcompl(kmer, opt::k);
bloom.add_to_kmer(min(kmer, rckmer), nidx);
kmers.push_back(min(kmer, rckmer));
for (int p = _p; p < seq_len; ++p) {
uint8_t new_char = to_int[seq->seq.s[p]];
if(new_char == 0) { // Found a char different from A, C, G, T
Expand All @@ -155,8 +154,9 @@ int main(int argc, char *argv[]) {
kmer = lsappend(kmer, new_char, opt::k);
rckmer = rsprepend(rckmer, reverse_char(new_char), opt::k);
}
bloom.add_to_kmer(min(kmer, rckmer), nidx);
kmers.push_back(min(kmer, rckmer));
}
bloom.add_to_kmer(kmers, nidx);
}
++nidx;
}
Expand Down

0 comments on commit e71f01c

Please sign in to comment.