Skip to content

Commit 03e7f4a

Browse files
committed
Squashed commit of the following:
commit 70409de0e2af0a16a7136bab31086ef7eab04d9b Author: Benjamin Buchfink <[email protected]> Date: Tue Jun 20 09:25:18 2023 +0200 Updated version. commit 4099a6d4e2d61fa0ca32e1ea2935bf7fda871601 Author: Benjamin Buchfink <[email protected]> Date: Mon Jun 19 17:03:26 2023 +0200 Added alias taxid. commit 38b77f968b2013944480e4c855be7ee8f6fe6c57 Author: Benjamin Buchfink <[email protected]> Date: Mon Jun 19 16:10:41 2023 +0200 Added qlen for sam format. commit 9db6be8ccd7293d501f4640cf317c72c0532c926 Author: Benjamin Buchfink <[email protected]> Date: Tue Jun 13 16:57:07 2023 +0200 Delete iostream commit f91883fac79b43c29989b3091dec6d981457164b Author: Benjamin Buchfink <[email protected]> Date: Tue Jun 13 12:53:17 2023 +0200 Use array for root. commit 2d9667af766c8379b1f734fd6263ab8ece2efd2a Author: Benjamin Buchfink <[email protected]> Date: Tue Jun 13 12:53:10 2023 +0200 Use array as root. commit 95023132611bf4dfa2667ea172a2aa5542a7cabe Author: emile151 <[email protected]> Date: Tue Jun 13 12:25:34 2023 +0200 Dev (#15) * json format reworked in blast_tab * json format reworked in blast_tab * json format reworked in blast_tab * json format reworked in blast_tab * json format reworked in blast_tab * json format reworked in blast_tab * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * json format reworked in blast_tab (comments fixed) * Merge conflict resolved * json format reworked in blast_tab (comments fixed) commit 608bc06a082c528eecdb2642205208906f1a756d Author: Benjamin Buchfink <[email protected]> Date: Tue Jun 13 11:23:31 2023 +0200 Added seed masking for --algo 1. 
commit aac15b8ba2489c76c8810714ae572f3e3278a1d8 Author: Benjamin Buchfink <[email protected]> Date: Mon Jun 12 17:23:31 2023 +0200 Filter low complex seeds in enum_seeds. commit ddca6fa9cce1eb65d17893930d969b8bd6228e1c Merge: 5c10a7dd 14f3550 Author: Benjamin Buchfink <[email protected]> Date: Wed Jun 7 09:33:45 2023 +0200 Merge branch 'master' into dev
1 parent 14f3550 commit 03e7f4a

30 files changed

+266
-151
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,5 +54,5 @@ Makefile
5454
diamond
5555
src/extra/
5656
.vs/
57-
CMakeSettings.json
5857
.unison
58+
/CMakeSettings.json

src/ChangeLog

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
[2.1.8]
2+
- Fixed an issue that could cause reduced performance when running in
3+
query-indexed mode.
4+
- Added support for the JSON output format (option `-f json-flat`).
5+
- Added the option `--sam-query-len` to output query length in SAM format.
6+
17
[2.1.7]
28
- Fixed a bug that caused taxonomy names not to be loaded correctly for the
39
`makedb` workflow.

src/align/align.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@ void align_queries(Consumer* output_file, Search::Config& cfg)
326326

327327
timer.go("Computing alignments");
328328
HitIterator hit_it(query_range.first, query_range.second, hit_buf->data(), hit_buf->data() + hit_buf->size());
329-
OutputWriter writer{ output_file };
329+
OutputWriter writer{output_file, (*cfg.output_format == OutputFormat::json) ? ',' : char(0)};
330330
output_sink.reset(new ReorderQueue<TextBuffer*, OutputWriter>(query_range.first, writer));
331331
unique_ptr<thread> heartbeat;
332332
if (config.verbosity >= 3 && config.load_balancing == Config::query_parallel && !config.swipe_all && config.heartbeat)

src/align/output.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
2626
#include "../output/daa/daa_write.h"
2727
#include "../util/sequence/sequence.h"
2828

29+
2930
using std::vector;
3031

3132
namespace Extension {
@@ -51,7 +52,7 @@ TextBuffer* generate_output(vector<Match> &targets, const Extension::Stats& stat
5152
}
5253
else if (aligned || config.report_unaligned)
5354
f->print_query_intro(info);
54-
55+
5556
for (int i = 0; i < (int)targets.size(); ++i) {
5657

5758
if (targets[i].hsp.empty())

src/basic/basic.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ along with this program. If not, see <http://www.gnu.org/licenses/>.
2929
#include "../util/util.h"
3030
#include "../stats/standard_matrix.h"
3131

32-
const char* Const::version_string = "2.1.7";
32+
const char* Const::version_string = "2.1.8";
3333
using std::string;
3434
using std::vector;
3535
using std::count;

src/basic/config.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -315,8 +315,9 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
315315
\t100 = DIAMOND alignment archive (DAA)\n\
316316
\t101 = SAM\n\
317317
\t102 = Taxonomic classification\n\
318-
\t103 = PAF\n\n\
319-
\tValue 6 may be followed by a space-separated list of these keywords:\n\n\
318+
\t103 = PAF\n\
319+
\t104 = JSON (flat)\n\n\
320+
\tValues 6 and 104 may be followed by a space-separated list of these keywords:\n\n\
320321
\tqseqid means Query Seq - id\n\
321322
\tqlen means Query sequence length\n\
322323
\tsseqid means Subject Seq - id\n\
@@ -422,11 +423,12 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
422423
("multiprocessing", 0, "enable distributed-memory parallel processing", multiprocessing)
423424
("mp-init", 0, "initialize multiprocessing run", mp_init)
424425
("mp-recover", 0, "enable continuation of interrupted multiprocessing run", mp_recover)
425-
("mp-query-chunk", 0, "process only a single query chunk as specified", mp_query_chunk, -1)
426+
("mp-query-chunk", 0, "process only a single query chunk as specified", mp_query_chunk, -1)
426427
("culling-overlap", 0, "minimum range overlap with higher scoring hit to delete a hit (default=50%)", inner_culling_overlap, 50.0)
427428
("taxon-k", 0, "maximum number of targets to report per species", taxon_k, (uint64_t)0)
428-
("range-cover", 0, "percentage of query range to be covered for range culling (default=50%)", query_range_cover, 50.0)
429+
("range-cover", 0, "percentage of query range to be covered for range culling (default=50%)", query_range_cover, 50.0)
429430
("xml-blord-format", 0, "Use gnl|BL_ORD_ID| style format in XML output", xml_blord_format)
431+
("sam-query-len", 0, "add the query length to the SAM format (tag ZQ)", sam_qlen_field)
430432
("stop-match-score", 0, "Set the match score of stop codons against each other.", stop_match_score, 1)
431433
("target-indexed", 0, "Enable target-indexed mode", target_indexed)
432434
("unaligned-targets", 0, "", unaligned_targets)
@@ -669,7 +671,7 @@ Config::Config(int argc, const char **argv, bool check_io, CommandLineParser& pa
669671

670672
if (verbosity >= 1 || command == regression_test) {
671673
ostream& header_out = command == Config::help ? cout : cerr;
672-
header_out << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " (C) Max Planck Society for the Advancement of Science" << endl;
674+
header_out << Const::program_name << " v" << Const::version_string << "." << (unsigned)Const::build_version << " (C) Max Planck Society for the Advancement of Science, Benjamin Buchfink, University of Tuebingen" << endl;
673675
header_out << "Documentation, support and updates available at http://www.diamondsearch.org" << endl;
674676
header_out << "Please cite: http://dx.doi.org/10.1038/s41592-021-01101-x Nature Methods (2021)" << endl << endl;
675677
}

src/basic/config.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,7 @@ struct Config
339339
int zdrop;
340340
bool heartbeat;
341341
bool no_parse_seqids;
342+
bool sam_qlen_field;
342343

343344
SequenceType dbtype;
344345

src/basic/const.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ struct Const
2525
{
2626

2727
enum {
28-
build_version = 161,
28+
build_version = 162,
2929
#ifdef SINGLE_THREADED
3030
seedp_bits = 0,
3131
#else

src/basic/seed_iterator.h

Lines changed: 24 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -159,31 +159,39 @@ struct SketchIterator
159159
template<uint64_t B>
160160
struct HashedSeedIterator
161161
{
162-
HashedSeedIterator(const Sequence &seq, const Shape &sh):
163-
ptr_(seq.data()),
164-
end_(ptr_ + seq.length()),
162+
HashedSeedIterator(Letter* seq, Loc len, const Shape &sh):
163+
long_mask(sh.long_mask()),
164+
ptr_(seq),
165+
end_(ptr_ + len),
165166
last_(0)
166167
{
167-
for (int i = 0; (i < sh.length_ - 1) && ptr_ < end_; ++i) {
168+
for (int i = 0; i < sh.length_ && ptr_ < end_; ++i)
168169
last_ = (last_ << B) | Reduction::reduction(letter_mask(*(ptr_++)));
169-
}
170170
}
171171
bool good() const
172172
{
173-
return ptr_ < end_;
173+
return ptr_ <= end_;
174174
}
175-
bool get(uint64_t &seed, uint64_t mask)
176-
{
177-
last_ <<= B;
178-
const Letter l = letter_mask(*(ptr_++));
179-
if (!is_amino_acid(l))
180-
return false;
181-
last_ |= Reduction::reduction(l);
182-
seed = MurmurHash()(last_ & mask);
183-
return true;
175+
uint64_t operator*() const {
176+
return MurmurHash()(last_ & long_mask);
177+
}
178+
HashedSeedIterator& operator++() {
179+
while (ptr_ < end_) {
180+
last_ <<= B;
181+
const Letter l = letter_mask(*(ptr_++));
182+
if (!is_amino_acid(l))
183+
continue;
184+
last_ |= Reduction::reduction(l);
185+
return *this;
186+
}
187+
++ptr_;
188+
}
189+
Letter* seq_ptr(const Shape& sh) const {
190+
return ptr_ - sh.length_;
184191
}
185192
private:
186-
const Letter *ptr_, *end_;
193+
const uint64_t long_mask;
194+
Letter *ptr_, *end_;
187195
uint64_t last_;
188196
};
189197

src/data/enum_seeds.h

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ Search::SeedStats enum_seeds_minimizer(SequenceSet* seqs, F* f, unsigned begin,
7171
template<typename F, uint64_t BITS, typename Filter>
7272
void enum_seeds_hashed(SequenceSet* seqs, F* f, unsigned begin, unsigned end, const Filter* filter, const EnumCfg& cfg)
7373
{
74-
uint64_t key;
7574
for (unsigned i = begin; i < end; ++i) {
7675
if (cfg.skip && (*cfg.skip)[i / align_mode.query_contexts])
7776
continue;
@@ -80,16 +79,16 @@ void enum_seeds_hashed(SequenceSet* seqs, F* f, unsigned begin, unsigned end, co
8079
for (size_t shape_id = cfg.shape_begin; shape_id < cfg.shape_end; ++shape_id) {
8180
const Shape& sh = shapes[shape_id];
8281
if (seq.length() < sh.length_) continue;
83-
const uint64_t shape_mask = sh.long_mask();
8482
//const __m128i shape_mask = sh.long_mask_sse_;
85-
HashedSeedIterator<BITS> it(seq, sh);
86-
Loc j = 0;
83+
HashedSeedIterator<BITS> it(seqs->ptr(i), seqs->length(i), sh);
8784
while (it.good()) {
88-
if (it.get(key, shape_mask)) {
89-
if (filter->contains(key, shape_id))
90-
(*f)(key, seqs->position(i, j), i, shape_id);
91-
}
92-
++j;
85+
const uint64_t key = *it;
86+
if (filter->contains(key, shape_id))
87+
if (!cfg.filter_low_complexity_seeds || Search::seed_is_complex(it.seq_ptr(sh), sh, cfg.seed_cut))
88+
(*f)(key, seqs->position(i, it.seq_ptr(sh) - seq.data()), i, shape_id);
89+
else if (cfg.mask_low_complexity_seeds)
90+
*it.seq_ptr(sh) |= SEED_MASK;
91+
++it;
9392
}
9493
}
9594
}

0 commit comments

Comments
 (0)