Merge pull request #218 from rhpvorderman/issue199

Add read splitting information to the report.
rhpvorderman · Dec 13, 2024 · 7047a51 · 7047a51
2 parents 8bad37a + 984921e
commit 7047a51
Show file tree

Hide file tree

Showing 5 changed files with 269 additions and 90 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,9 @@ version 0.13.0-dev
 ------------------
 + Python 3.13 support was added.
 + Python 3.8 and 3.9 are no longer supported.
++ Add a plot show how many reads originate from read splitting. This primarily
+  serves to notify the user that chimeric read splitting has already been
+  performed by dorado.
 + Add a N50 and N90 statistic to the sequence length distribution report.
 + Allow proper judging of aligned BAM files as input data by ignoring any
   secondary or supplementary alignment records. This is equivalent to running

diff --git a/src/sequali/_qc.pyi b/src/sequali/_qc.pyi
@@ -156,6 +156,7 @@ class NanoporeReadInfo:
     length: int
     cumulative_error_rate: float
     duration: float
+    parent_id_hash: int
 
 class NanoStats:
     number_of_reads: int

diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c
@@ -4479,6 +4479,7 @@ struct NanoInfo {
     int32_t channel_id;
     uint32_t length;
     double cumulative_error_rate;
+    uint64_t parent_id_hash;
 };
 
 typedef struct {
@@ -4512,6 +4513,12 @@ NanoporeReadInfo_get_duration(NanoporeReadInfo *self, void *closure)
     return PyFloat_FromDouble((double)self->info.duration);
 }
 
+static PyObject *
+NanoporeReadInfo_get_parent_id_hash(NanoporeReadInfo *self, void *closure)
+{
+    return PyLong_FromUnsignedLongLong(self->info.parent_id_hash);
+}
+
 static PyGetSetDef NanoporeReadInfo_properties[] = {
     {"start_time", (getter)NanoporeReadInfo_get_start_time, NULL,
      "unix UTC timestamp for start time", NULL},
@@ -4521,6 +4528,8 @@ static PyGetSetDef NanoporeReadInfo_properties[] = {
     {"cumulative_error_rate", (getter)NanoporeReadInfo_get_cumulative_error_rate,
      NULL, "sum off all the bases' error rates.", NULL},
     {"duration", (getter)NanoporeReadInfo_get_duration, NULL, NULL, NULL},
+    {"parent_id_hash", (getter)NanoporeReadInfo_get_parent_id_hash, NULL, NULL,
+     NULL},
     {NULL},
 };
 
@@ -4717,93 +4726,205 @@ NanoInfo_from_header(const uint8_t *header, size_t header_length,
     return 0;
 }
 
+static Py_ssize_t
+get_tag_int_value(const uint8_t *tag)
+{
+    uint8_t tag_type = tag[2];
+    const uint8_t *value_start = tag + 3;
+    switch (tag_type) {
+        case 'c':
+            return ((int8_t *)value_start)[0];
+        case 'C':
+            return ((uint8_t *)value_start)[0];
+        case 's':
+            return ((int16_t *)value_start)[0];
+        case 'S':
+            return ((uint16_t *)value_start)[0];
+        case 'i':
+            return ((int32_t *)value_start)[0];
+        case 'I':
+            return ((uint32_t *)value_start)[0];
+        default:
+            return PY_SSIZE_T_MIN;
+    }
+}
+
+static Py_ssize_t
+tag_length(const uint8_t *tag, size_t maximum_tag_length)
+{
+    if (maximum_tag_length < 4) {
+        PyErr_SetString(PyExc_ValueError, "truncated tags");
+        return -1;
+    }
+    uint8_t tag_type = tag[2];
+    const uint8_t *value_start = tag + 3;
+    size_t value_length;
+    bool is_array = false;
+    uint32_t array_length = 1;
+    if (tag_type == 'B') {
+        is_array = true;
+        value_start = tag + 8;
+        tag_type = tag[3];
+        if (maximum_tag_length < 8) {
+            PyErr_SetString(PyExc_ValueError, "truncated tags");
+            return -1;
+        }
+        array_length = *(uint32_t *)(tag + 4);
+    };
+
+    switch (tag_type) {
+        case 'A':
+        case 'c':
+        case 'C':
+            value_length = 1;
+            break;
+        case 's':
+        case 'S':
+            value_length = 2;
+            break;
+        case 'I':
+        case 'i':
+        case 'f':
+            value_length = 4;
+            break;
+        case 'Z':
+        case 'H':
+            if (is_array) {
+                PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
+                             tag_type);
+                return -1;
+            }
+            uint8_t *string_end = memchr(value_start, 0, maximum_tag_length - 3);
+            if (string_end == NULL) {
+                PyErr_SetString(PyExc_ValueError, "truncated tags");
+                return -1;
+            }
+            value_length =
+                (string_end - value_start) + 1;  // +1 for terminating null
+            break;
+        default:
+            PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
+            return -1;
+    }
+    size_t this_tag_length = (value_start - tag) + array_length * value_length;
+    if (this_tag_length > maximum_tag_length) {
+        PyErr_SetString(PyExc_ValueError, "truncated tags");
+        return -1;
+    }
+    return this_tag_length;
+}
+
+struct TagInfo {
+    int32_t channel_id;
+    float duration;
+    time_t start_time;
+    uint64_t parent_id_hash;
+};
+
+/**
+ * @brief "Hash" a uuid4 by using the first 8 digits and last 8 digits for
+ * 64 random bits. Return 0 on error.
+ */
+static uint64_t
+uuid4_hash(char *uuid)
+{
+    /* UUID4 takes the form of,
+        xxxxxxxx-xxxx-Mxxx-Nxxx-xxxxxxxxxxxx
+        ^^^^^^^^                    ^^^^^^^^
+        These hexadecimal digits are used, 16 in total. M should be 4. N can
+        be 8,9,A,B,C,D. The rest of the digits are andom.
+     */
+    if (uuid[8] != '-' || uuid[13] != '-' ||
+        uuid[14] != '4' ||  // UUID version 4 check.
+        uuid[18] != '-' || uuid[23] != '-' || uuid[36] != 0) {
+        return 0;
+    }
+    char *end_ptr = uuid;
+    uint64_t first_bit = strtoull(uuid, &end_ptr, 16);
+    if (end_ptr - uuid != 8) {
+        // strtoull stops at first non-hexadecimal. This should be at position 8.
+        return 0;
+    }
+    uint64_t last_bit = strtoull(uuid + 28, &end_ptr, 16);
+    if (end_ptr - uuid != 36) {
+        // first non-hexadecimal should be at string end.
+        return 0;
+    }
+    return (first_bit << 32) | (last_bit & 0xFFFFFFFFULL);
+}
+
+/**
+ * @brief Throw a Python RuntimeError for an unexpected typecode and return -1
+ */
+static inline int
+tag_wrong_typecode(char *tag, char expected_typecode, char actual_typecode)
+{
+    PyErr_Format(PyExc_RuntimeError,
+                 "Wrong tag type for '%s' expected '%c' got '%c'", tag,
+                 expected_typecode, actual_typecode);
+    return -1;
+}
+
+/**
+ * @brief correct memcmp shorthand for ease of writing.
+ */
+static inline bool
+has_tag_id(const uint8_t *restrict tag, char *expected_tag)
+{
+    return memcmp(tag, expected_tag, strlen(expected_tag)) == 0;
+    ;
+}
+
 static int
-NanoInfo_from_tags(const uint8_t *tags, size_t tags_length, struct NanoInfo *info)
+TagInfo_from_tags(const uint8_t *tags, size_t tags_length, struct TagInfo *info)
 {
     info->channel_id = -1;
     info->duration = 0.0;
     info->start_time = 0;
+    info->parent_id_hash = 0;
     while (tags_length > 0) {
-        if (tags_length < 4) {
-            PyErr_SetString(PyExc_ValueError, "truncated tags");
+        Py_ssize_t this_tag_length = tag_length(tags, tags_length);
+        if (this_tag_length == -1) {
             return -1;
         }
-        const uint8_t *tag_id = tags;
-        uint8_t tag_type = tags[2];
-        bool is_array = false;
-        const uint8_t *value_start = tags + 3;
-        uint32_t array_length = 1;
-        if (tag_type == 'B') {
-            is_array = true;
-            value_start = tags + 8;
-            tag_type = tags[3];
-            if (tags_length < 8) {
-                PyErr_SetString(PyExc_ValueError, "truncated tags");
+        const uint8_t *tag = tags;
+        uint8_t typecode = tags[2];
+
+        if (has_tag_id(tag, "ch")) {
+            Py_ssize_t channel_id = get_tag_int_value(tag);
+            if (channel_id == PY_SSIZE_T_MIN) {
                 return -1;
             }
-            array_length = *(uint32_t *)(tags + 4);
-        };
-        size_t value_length;
-        switch (tag_type) {
-            case 'A':
-                value_length = 1;
-                break;
-            case 'c':
-            case 'C':
-                /* A very annoying habit of htslib to store a tag in the
-                   smallest possible size rather than being consistent. */
-                value_length = 1;
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 4) {
-                    info->channel_id = *(uint8_t *)(value_start);
-                }
-                break;
-            case 's':
-            case 'S':
-                value_length = 2;
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 5) {
-                    info->channel_id = *(uint16_t *)(value_start);
-                }
-                break;
-            case 'I':
-            case 'i':
-                if (memcmp(tag_id, "ch", 2) == 0 && tags_length >= 7) {
-                    info->channel_id = *(uint32_t *)(value_start);
-                }
-                value_length = 4;
-                break;
-            case 'f':
-                if (memcmp(tag_id, "du", 2) == 0 && tags_length >= 7) {
-                    info->duration = *(float *)(value_start);
-                }
-                value_length = 4;
-                break;
-            case 'Z':
-            case 'H':
-                if (is_array) {
-                    PyErr_Format(PyExc_ValueError, "Invalid type for array %c",
-                                 tag_type);
-                    return -1;
-                }
-                uint8_t *string_end = memchr(value_start, 0, tags_length - 3);
-                if (string_end == NULL) {
-                    PyErr_SetString(PyExc_ValueError, "truncated tags");
-                    return -1;
-                }
-                value_length =
-                    (string_end - value_start) + 1;  // +1 for terminating null
-                if (memcmp(tag_id, "st", 2) == 0) {
-                    info->start_time = time_string_to_timestamp(value_start);
-                }
-                break;
-            default:
-                PyErr_Format(PyExc_ValueError, "Unknown tag type %c", tag_type);
-                return -1;
+            info->channel_id = channel_id;
         }
-        size_t this_tag_length =
-            (value_start - tags) + array_length * value_length;
-        if (this_tag_length > tags_length) {
-            PyErr_SetString(PyExc_ValueError, "truncated tags");
-            return -1;
+        else if (has_tag_id(tag, "st")) {
+            if (typecode != 'Z') {
+                return tag_wrong_typecode("st", 'Z', typecode);
+            }
+            info->start_time = time_string_to_timestamp(tags + 3);
+        }
+        else if (has_tag_id(tag, "du")) {
+            if (typecode != 'f') {
+                return tag_wrong_typecode("du", 'f', typecode);
+            }
+            info->duration = ((float *)(tags + 3))[0];
+        }
+        else if (has_tag_id(tag, "pi")) {
+            if (typecode != 'Z') {
+                return tag_wrong_typecode("pi", 'Z', typecode);
+            }
+            const uint8_t *value = tag + 3;
+            // -3 for tag id, typecode. -1 for terminating 0.
+            size_t value_length = this_tag_length - 4;
+            if (value_length != 36) {
+                PyErr_Format(PyExc_RuntimeError,
+                             "pi tag should have a valid uuid4 format with 36 "
+                             "characters. "
+                             " Counted %zu.",
+                             value_length);
+                return -1;
+            }
+            info->parent_id_hash = uuid4_hash((char *)value);
         }
         tags = tags + this_tag_length;
         tags_length -= this_tag_length;
@@ -4842,10 +4963,15 @@ NanoStats_add_meta(NanoStats *self, struct FastqMeta *meta)
     info->length = sequence_length;
 
     if (meta->tags_length) {
-        if (NanoInfo_from_tags(meta->record_start + meta->tags_offset,
-                               meta->tags_length, info) != 0) {
+        struct TagInfo tag_info;
+        if (TagInfo_from_tags(meta->record_start + meta->tags_offset,
+                              meta->tags_length, &tag_info) != 0) {
             return -1;
         }
+        info->channel_id = tag_info.channel_id;
+        info->duration = tag_info.duration;
+        info->start_time = tag_info.start_time;
+        info->parent_id_hash = tag_info.parent_id_hash;
     }
     else if (NanoInfo_from_header(meta->name, meta->name_length, info) != 0) {
         PyObject *header_obj = PyUnicode_DecodeASCII((const char *)meta->name,