Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add read splitting information to the report. #218

Merged
merged 6 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ version 0.13.0-dev
------------------
+ Python 3.13 support was added.
+ Python 3.8 and 3.9 are no longer supported.
+ Add a plot show how many reads originate from read splitting. This primarily
serves to notify the user that chimeric read splitting has already been
performed by dorado.
+ Add a N50 and N90 statistic to the sequence length distribution report.
+ Allow proper judging of aligned BAM files as input data by ignoring any
secondary or supplementary alignment records. This is equivalent to running
Expand Down
1 change: 1 addition & 0 deletions src/sequali/_qc.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ class NanoporeReadInfo:
length: int
cumulative_error_rate: float
duration: float
parent_id_hash: int

class NanoStats:
number_of_reads: int
Expand Down
282 changes: 204 additions & 78 deletions src/sequali/_qcmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -4479,6 +4479,7 @@ struct NanoInfo {
int32_t channel_id;
uint32_t length;
double cumulative_error_rate;
uint64_t parent_id_hash;
};

typedef struct {
Expand Down Expand Up @@ -4512,6 +4513,12 @@ NanoporeReadInfo_get_duration(NanoporeReadInfo *self, void *closure)
return PyFloat_FromDouble((double)self->info.duration);
}

static PyObject *
NanoporeReadInfo_get_parent_id_hash(NanoporeReadInfo *self, void *closure)
{
return PyLong_FromUnsignedLongLong(self->info.parent_id_hash);
}

static PyGetSetDef NanoporeReadInfo_properties[] = {
{"start_time", (getter)NanoporeReadInfo_get_start_time, NULL,
"unix UTC timestamp for start time", NULL},
Expand All @@ -4521,6 +4528,8 @@ static PyGetSetDef NanoporeReadInfo_properties[] = {
{"cumulative_error_rate", (getter)NanoporeReadInfo_get_cumulative_error_rate,
NULL, "sum off all the bases' error rates.", NULL},
{"duration", (getter)NanoporeReadInfo_get_duration, NULL, NULL, NULL},
{"parent_id_hash", (getter)NanoporeReadInfo_get_parent_id_hash, NULL, NULL,
NULL},
{NULL},
};

Expand Down Expand Up @@ -4717,93 +4726,205 @@ NanoInfo_from_header(const uint8_t *header, size_t header_length,
return 0;
}

static Py_ssize_t
get_tag_int_value(const uint8_t *tag)
{
uint8_t tag_type = tag[2];
const uint8_t *value_start = tag + 3;
switch (tag_type) {
case 'c':
return ((int8_t *)value_start)[0];
case 'C':
return ((uint8_t *)value_start)[0];
case 's':
return ((int16_t *)value_start)[0];
case 'S':
return ((uint16_t *)value_start)[0];
case 'i':
return ((int32_t *)value_start)[0];
case 'I':
return ((uint32_t *)value_start)[0];
default:
return PY_SSIZE_T_MIN;
}
}

static Py_ssize_t
tag_length(const uint8_t *tag, size_t maximum_tag_length)
{
if (maximum_tag_length < 4) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
uint8_t tag_type = tag[2];
const uint8_t *value_start = tag + 3;
size_t value_length;
bool is_array = false;
uint32_t array_length = 1;
if (tag_type == 'B') {
is_array = true;
value_start = tag + 8;
tag_type = tag[3];
if (maximum_tag_length < 8) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
array_length = *(uint32_t *)(tag + 4);
};

switch (tag_type) {
case 'A':
case 'c':
case 'C':
value_length = 1;
break;
case 's':
case 'S':
value_length = 2;
break;
case 'I':
case 'i':
case 'f':
value_length = 4;
break;
case 'Z':
case 'H':
if (is_array) {
PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
tag_type);
return -1;
}
uint8_t *string_end = memchr(value_start, 0, maximum_tag_length - 3);
if (string_end == NULL) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
value_length =
(string_end - value_start) + 1; // +1 for terminating null
break;
default:
PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
return -1;
}
size_t this_tag_length = (value_start - tag) + array_length * value_length;
if (this_tag_length > maximum_tag_length) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
return this_tag_length;
}

struct TagInfo {
int32_t channel_id;
float duration;
time_t start_time;
uint64_t parent_id_hash;
};

/**
* @brief "Hash" a uuid4 by using the first 8 digits and last 8 digits for
* 64 random bits. Return 0 on error.
*/
static uint64_t
uuid4_hash(char *uuid)
{
/* UUID4 takes the form of,
xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx
^^^^^^^^ ^^^^^^^^
These hexadecimal digits are used, 16 in total. M should be 4. N can
be 8,9,A,B,C,D. The rest of the digits are andom.
*/
if (uuid[8] != '-' || uuid[13] != '-' ||
uuid[14] != '4' || // UUID version 4 check.
uuid[18] != '-' || uuid[23] != '-' || uuid[36] != 0) {
return 0;
}
char *end_ptr = uuid;
uint64_t first_bit = strtoull(uuid, &end_ptr, 16);
if (end_ptr - uuid != 8) {
// strtoull stops at first non-hexadecimal. This should be at position 8.
return 0;
}
uint64_t last_bit = strtoull(uuid + 28, &end_ptr, 16);
if (end_ptr - uuid != 36) {
// first non-hexadecimal should be at string end.
return 0;
}
return (first_bit << 32) | (last_bit & 0xFFFFFFFFULL);
}

/**
* @brief Throw a Python RuntimeError for an unexpected typecode and return -1
*/
static inline int
tag_wrong_typecode(char *tag, char expected_typecode, char actual_typecode)
{
PyErr_Format(PyExc_RuntimeError,
"Wrong tag type for '%s' expected '%c' got '%c'", tag,
expected_typecode, actual_typecode);
return -1;
}

/**
* @brief correct memcmp shorthand for ease of writing.
*/
static inline bool
has_tag_id(const uint8_t *restrict tag, char *expected_tag)
{
return memcmp(tag, expected_tag, strlen(expected_tag)) == 0;
;
}

static int
NanoInfo_from_tags(const uint8_t *tags, size_t tags_length, struct NanoInfo *info)
TagInfo_from_tags(const uint8_t *tags, size_t tags_length, struct TagInfo *info)
{
info->channel_id = -1;
info->duration = 0.0;
info->start_time = 0;
info->parent_id_hash = 0;
while (tags_length > 0) {
if (tags_length < 4) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
Py_ssize_t this_tag_length = tag_length(tags, tags_length);
if (this_tag_length == -1) {
return -1;
}
const uint8_t *tag_id = tags;
uint8_t tag_type = tags[2];
bool is_array = false;
const uint8_t *value_start = tags + 3;
uint32_t array_length = 1;
if (tag_type == 'B') {
is_array = true;
value_start = tags + 8;
tag_type = tags[3];
if (tags_length < 8) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
const uint8_t *tag = tags;
uint8_t typecode = tags[2];

if (has_tag_id(tag, "ch")) {
Py_ssize_t channel_id = get_tag_int_value(tag);
if (channel_id == PY_SSIZE_T_MIN) {
return -1;
}
array_length = *(uint32_t *)(tags + 4);
};
size_t value_length;
switch (tag_type) {
case 'A':
value_length = 1;
break;
case 'c':
case 'C':
/* A very annoying habit of htslib to store a tag in the
smallest possible size rather than being consistent. */
value_length = 1;
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 4) {
info->channel_id = *(uint8_t *)(value_start);
}
break;
case 's':
case 'S':
value_length = 2;
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 5) {
info->channel_id = *(uint16_t *)(value_start);
}
break;
case 'I':
case 'i':
if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 7) {
info->channel_id = *(uint32_t *)(value_start);
}
value_length = 4;
break;
case 'f':
if (memcmp(tag_id, "du", 2) == 0 && tags_length >= 7) {
info->duration = *(float *)(value_start);
}
value_length = 4;
break;
case 'Z':
case 'H':
if (is_array) {
PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
tag_type);
return -1;
}
uint8_t *string_end = memchr(value_start, 0, tags_length - 3);
if (string_end == NULL) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
}
value_length =
(string_end - value_start) + 1; // +1 for terminating null
if (memcmp(tag_id, "st", 2) == 0) {
info->start_time = time_string_to_timestamp(value_start);
}
break;
default:
PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
return -1;
info->channel_id = channel_id;
}
size_t this_tag_length =
(value_start - tags) + array_length * value_length;
if (this_tag_length > tags_length) {
PyErr_SetString(PyExc_ValueError, "truncated tags");
return -1;
else if (has_tag_id(tag, "st")) {
if (typecode != 'Z') {
return tag_wrong_typecode("st", 'Z', typecode);
}
info->start_time = time_string_to_timestamp(tags + 3);
}
else if (has_tag_id(tag, "du")) {
if (typecode != 'f') {
return tag_wrong_typecode("du", 'f', typecode);
}
info->duration = ((float *)(tags + 3))[0];
}
else if (has_tag_id(tag, "pi")) {
if (typecode != 'Z') {
return tag_wrong_typecode("pi", 'Z', typecode);
}
const uint8_t *value = tag + 3;
// -3 for tag id, typecode. -1 for terminating 0.
size_t value_length = this_tag_length - 4;
if (value_length != 36) {
PyErr_Format(PyExc_RuntimeError,
"pi tag should have a valid uuid4 format with 36 "
"characters. "
" Counted %zu.",
value_length);
return -1;
}
info->parent_id_hash = uuid4_hash((char *)value);
}
tags = tags + this_tag_length;
tags_length -= this_tag_length;
Expand Down Expand Up @@ -4842,10 +4963,15 @@ NanoStats_add_meta(NanoStats *self, struct FastqMeta *meta)
info->length = sequence_length;

if (meta->tags_length) {
if (NanoInfo_from_tags(meta->record_start + meta->tags_offset,
meta->tags_length, info) != 0) {
struct TagInfo tag_info;
if (TagInfo_from_tags(meta->record_start + meta->tags_offset,
meta->tags_length, &tag_info) != 0) {
return -1;
}
info->channel_id = tag_info.channel_id;
info->duration = tag_info.duration;
info->start_time = tag_info.start_time;
info->parent_id_hash = tag_info.parent_id_hash;
}
else if (NanoInfo_from_header(meta->name, meta->name_length, info) != 0) {
PyObject *header_obj = PyUnicode_DecodeASCII((const char *)meta->name,
Expand Down
Loading
Loading