Skip to content

Commit

Permalink
Merge pull request #218 from rhpvorderman/issue199
Browse files Browse the repository at this point in the history
Add read splitting information to the report.
  • Loading branch information
rhpvorderman authored Dec 13, 2024
2 parents 8bad37a + 984921e commit 7047a51
Show file tree
Hide file tree
Showing 5 changed files with 269 additions and 90 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ version 0.13.0-dev
------------------
+ Python 3.13 support was added.
+ Python 3.8 and 3.9 are no longer supported.
+ Add a plot show how many reads originate from read splitting. This primarily
serves to notify the user that chimeric read splitting has already been
performed by dorado.
+ Add a N50 and N90 statistic to the sequence length distribution report.
+ Allow proper judging of aligned BAM files as input data by ignoring any
secondary or supplementary alignment records. This is equivalent to running
Expand Down
1 change: 1 addition & 0 deletions src/sequali/_qc.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class NanoporeReadInfo:
length: int
cumulative_error_rate: float
duration: float
parent_id_hash: int

class NanoStats:
number_of_reads: int
Expand Down
282 changes: 204 additions & 78 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -4479,6 +4479,7 @@ struct NanoInfo {
int32_t channel_id;
uint32_t length;
double cumulative_error_rate;
uint64_t parent_id_hash;
};

typedef struct {
Expand Down Expand Up @@ -4512,6 +4513,12 @@ NanoporeReadInfo_get_duration(NanoporeReadInfo *self, void *closure)
return PyFloat_FromDouble((double)self->info.duration);
}

static PyObject *
NanoporeReadInfo_get_parent_id_hash(NanoporeReadInfo *self, void *closure)
{
return PyLong_FromUnsignedLongLong(self->info.parent_id_hash);
}

static PyGetSetDef NanoporeReadInfo_properties[] = {
{"start_time", (getter)NanoporeReadInfo_get_start_time, NULL,
"unix UTC timestamp for start time", NULL},
Expand All @@ -4521,6 +4528,8 @@ static PyGetSetDef NanoporeReadInfo_properties[] = {
{"cumulative_error_rate", (getter)NanoporeReadInfo_get_cumulative_error_rate,
NULL, "sum off all the bases' error rates.", NULL},
{"duration", (getter)NanoporeReadInfo_get_duration, NULL, NULL, NULL},
{"parent_id_hash", (getter)NanoporeReadInfo_get_parent_id_hash, NULL, NULL,
NULL},
{NULL},
};

Expand Down Expand Up @@ -4717,93 +4726,205 @@ NanoInfo_from_header(const uint8_t *header, size_t header_length,
return 0;
}

static Py_ssize_t
get_tag_int_value(const uint8_t *tag)
{
uint8_t tag_type = tag[2];
const uint8_t *value_start = tag + 3;
switch (tag_type) {
case 'c':
return ((int8_t *)value_start)[0];
case 'C':
return ((uint8_t *)value_start)[0];
case 's':
return ((int16_t *)value_start)[0];
case 'S':
return ((uint16_t *)value_start)[0];
case 'i':
return ((int32_t *)value_start)[0];
case 'I':
return ((uint32_t *)value_start)[0];
default:
return PY_SSIZE_T_MIN;
}
}

static Py_ssize_t
tag_length(const uint8_t *tag, size_t maximum_tag_length)
{
if (maximum_tag_length < 4) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
uint8_t tag_type = tag[2];
const uint8_t *value_start = tag + 3;
size_t value_length;
bool is_array = false;
uint32_t array_length = 1;
if (tag_type == 'B') {
is_array = true;
value_start = tag + 8;
tag_type = tag[3];
if (maximum_tag_length < 8) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
array_length = *(uint32_t *)(tag + 4);
};

switch (tag_type) {
case 'A':
case 'c':
case 'C':
value_length = 1;
break;
case 's':
case 'S':
value_length = 2;
break;
case 'I':
case 'i':
case 'f':
value_length = 4;
break;
case 'Z':
case 'H':
if (is_array) {
PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
tag_type);
return -1;
}
uint8_t *string_end = memchr(value_start, 0, maximum_tag_length - 3);
if (string_end == NULL) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
value_length =
(string_end - value_start) + 1; // +1 for terminating null
break;
default:
PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
return -1;
}
size_t this_tag_length = (value_start - tag) + array_length * value_length;
if (this_tag_length > maximum_tag_length) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
return this_tag_length;
}

struct TagInfo {
int32_t channel_id;
float duration;
time_t start_time;
uint64_t parent_id_hash;
};

/**
* @brief "Hash" a uuid4 by using the first 8 digits and last 8 digits for
* 64 random bits. Return 0 on error.
*/
static uint64_t
uuid4_hash(char *uuid)
{
/* UUID4 takes the form of,
xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx
^^^^^^^^ ^^^^^^^^
These hexadecimal digits are used, 16 in total. M should be 4. N can
be 8,9,A,B,C,D. The rest of the digits are andom.
*/
if (uuid[8] != '-' || uuid[13] != '-' ||
uuid[14] != '4' || // UUID version 4 check.
uuid[18] != '-' || uuid[23] != '-' || uuid[36] != 0) {
return 0;
}
char *end_ptr = uuid;
uint64_t first_bit = strtoull(uuid, &end_ptr, 16);
if (end_ptr - uuid != 8) {
// strtoull stops at first non-hexadecimal. This should be at position 8.
return 0;
}
uint64_t last_bit = strtoull(uuid + 28, &end_ptr, 16);
if (end_ptr - uuid != 36) {
// first non-hexadecimal should be at string end.
return 0;
}
return (first_bit << 32) | (last_bit & 0xFFFFFFFFULL);
}

/**
* @brief Throw a Python RuntimeError for an unexpected typecode and return -1
*/
static inline int
tag_wrong_typecode(char *tag, char expected_typecode, char actual_typecode)
{
PyErr_Format(PyExc_RuntimeError,
"Wrong tag type for '%s' expected '%c' got '%c'", tag,
expected_typecode, actual_typecode);
return -1;
}

/**
* @brief correct memcmp shorthand for ease of writing.
*/
static inline bool
has_tag_id(const uint8_t *restrict tag, char *expected_tag)
{
return memcmp(tag, expected_tag, strlen(expected_tag)) == 0;
;
}

static int
NanoInfo_from_tags(const uint8_t *tags, size_t tags_length, struct NanoInfo *info)
TagInfo_from_tags(const uint8_t *tags, size_t tags_length, struct TagInfo *info)
{
info->channel_id = -1;
info->duration = 0.0;
info->start_time = 0;
info->parent_id_hash = 0;
while (tags_length > 0) {
if (tags_length < 4) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
Py_ssize_t this_tag_length = tag_length(tags, tags_length);
if (this_tag_length == -1) {
return -1;
}
const uint8_t *tag_id = tags;
uint8_t tag_type = tags[2];
bool is_array = false;
const uint8_t *value_start = tags + 3;
uint32_t array_length = 1;
if (tag_type == 'B') {
is_array = true;
value_start = tags + 8;
tag_type = tags[3];
if (tags_length < 8) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
const uint8_t *tag = tags;
uint8_t typecode = tags[2];

if (has_tag_id(tag, "ch")) {
Py_ssize_t channel_id = get_tag_int_value(tag);
if (channel_id == PY_SSIZE_T_MIN) {
return -1;
}
array_length = *(uint32_t *)(tags + 4);
};
size_t value_length;
switch (tag_type) {
case 'A':
value_length = 1;
break;
case 'c':
case 'C':
/* A very annoying habit of htslib to store a tag in the
smallest possible size rather than being consistent. */
value_length = 1;
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 4) {
info->channel_id = *(uint8_t *)(value_start);
}
break;
case 's':
case 'S':
value_length = 2;
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 5) {
info->channel_id = *(uint16_t *)(value_start);
}
break;
case 'I':
case 'i':
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 7) {
info->channel_id = *(uint32_t *)(value_start);
}
value_length = 4;
break;
case 'f':
if (memcmp(tag_id, "du", 2) == 0 && tags_length >= 7) {
info->duration = *(float *)(value_start);
}
value_length = 4;
break;
case 'Z':
case 'H':
if (is_array) {
PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
tag_type);
return -1;
}
uint8_t *string_end = memchr(value_start, 0, tags_length - 3);
if (string_end == NULL) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
value_length =
(string_end - value_start) + 1; // +1 for terminating null
if (memcmp(tag_id, "st", 2) == 0) {
info->start_time = time_string_to_timestamp(value_start);
}
break;
default:
PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
return -1;
info->channel_id = channel_id;
}
size_t this_tag_length =
(value_start - tags) + array_length * value_length;
if (this_tag_length > tags_length) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
else if (has_tag_id(tag, "st")) {
if (typecode != 'Z') {
return tag_wrong_typecode("st", 'Z', typecode);
}
info->start_time = time_string_to_timestamp(tags + 3);
}
else if (has_tag_id(tag, "du")) {
if (typecode != 'f') {
return tag_wrong_typecode("du", 'f', typecode);
}
info->duration = ((float *)(tags + 3))[0];
}
else if (has_tag_id(tag, "pi")) {
if (typecode != 'Z') {
return tag_wrong_typecode("pi", 'Z', typecode);
}
const uint8_t *value = tag + 3;
// -3 for tag id, typecode. -1 for terminating 0.
size_t value_length = this_tag_length - 4;
if (value_length != 36) {
PyErr_Format(PyExc_RuntimeError,
"pi tag should have a valid uuid4 format with 36 "
"characters. "
" Counted %zu.",
value_length);
return -1;
}
info->parent_id_hash = uuid4_hash((char *)value);
}
tags = tags + this_tag_length;
tags_length -= this_tag_length;
Expand Down Expand Up @@ -4842,10 +4963,15 @@ NanoStats_add_meta(NanoStats *self, struct FastqMeta *meta)
info->length = sequence_length;

if (meta->tags_length) {
if (NanoInfo_from_tags(meta->record_start + meta->tags_offset,
meta->tags_length, info) != 0) {
struct TagInfo tag_info;
if (TagInfo_from_tags(meta->record_start + meta->tags_offset,
meta->tags_length, &tag_info) != 0) {
return -1;
}
info->channel_id = tag_info.channel_id;
info->duration = tag_info.duration;
info->start_time = tag_info.start_time;
info->parent_id_hash = tag_info.parent_id_hash;
}
else if (NanoInfo_from_header(meta->name, meta->name_length, info) != 0) {
PyObject *header_obj = PyUnicode_DecodeASCII((const char *)meta->name,
Expand Down
Loading

0 comments on commit 7047a51

Please sign in to comment.