Merge pull request #18 from hammerlab/trim-N-from-read-sequence

iskandr · iskandr · commit 09ce9a52ebba · 2016-04-25T12:54:02.000-04:00
Trim 'N' from nucleotide sequences when creating VariantRead
diff --git a/isovar/__init__.py b/isovar/__init__.py
@@ -14,4 +14,4 @@
 
 from __future__ import print_function, division, absolute_import
 
-__version__ = "0.0.2"
+__version__ = "0.0.3"
diff --git a/isovar/translation.py b/isovar/translation.py
@@ -158,9 +158,13 @@ def trim_sequences(variant_sequence, reference_context):
     # take the sequence PREFIX|VARIANT|SUFFIX
     # and take the complement of XIFFUS|TNAIRAV|XIFERP
     if reference_context.strand == "-":
-        cdna_prefix = cdna_suffix.reverse_complement()
-        cdna_alt = cdna_alt.reverse_complement()
-        cdna_suffix = cdna_prefix.reverse_complement()
+        # notice that we are setting the *prefix* to be reverse complement
+        # of the *suffix* and vice versa
+        cdna_prefix, cdna_alt, cdna_suffix = (
+            cdna_suffix.reverse_complement(),
+            cdna_alt.reverse_complement(),
+            cdna_prefix.reverse_complement()
+        )
 
     reference_sequence_before_variant = reference_context.sequence_before_variant_locus
 
diff --git a/isovar/variant_read.py b/isovar/variant_read.py
@@ -34,10 +34,130 @@
 VariantRead = namedtuple(
     "VariantRead", "prefix alt suffix name")
 
+
+def trim_N_nucleotides(prefix, suffix):
+    """
+    Trim prefix to the text after its last 'N' and suffix to the text before
+    its first 'N'. Returns the trimmed (prefix, suffix) pair.
+    """
+    if 'N' in prefix:
+        # keep only what follows the rightmost N so no N remains in prefix
+        rightmost_index = prefix.rfind('N')
+        logging.debug("Trimming %d nucleotides from read prefix '%s'" % (
+            rightmost_index + 1, prefix))
+        prefix = prefix[rightmost_index + 1:]
+
+    if 'N' in suffix:
+        leftmost_index = suffix.find('N')
+        logging.debug("Trimming %d nucleotides from read suffix '%s'" % (
+            len(suffix)  - leftmost_index,
+            suffix))
+        suffix = suffix[:leftmost_index]
+
+    return prefix, suffix
+
+def convert_from_bytes_if_necessary(prefix, suffix):
+    """
+    Depending on how we extract data from pysam we may end up with either
+    str or bytes nucleotides. Decode any bytes argument as ASCII and return
+    the (prefix, suffix) pair so the rest of the code only handles strings.
+    """
+    if isinstance(prefix, bytes):
+        prefix = prefix.decode('ascii')
+
+    if isinstance(suffix, bytes):
+        suffix = suffix.decode('ascii')
+
+    return prefix, suffix
+
+def variant_read_from_single_read_at_locus(read, ref, alt):
+    """
+    Given a single ReadAtLocus object, return either a VariantRead or None
+    (if the read's reference positions are inconsistent with the variant).
+
+    Parameters
+    ----------
+    read : ReadAtLocus
+        Read which may possibly contain the alternate nucleotides
+
+    ref : str
+        Reference sequence of the variant (empty for insertions)
+
+    alt : str
+        Alternate sequence of the variant (empty for deletions)
+
+    """
+    sequence = read.sequence
+    reference_positions = read.reference_positions
+
+    # positions of the nucleotides before and after the variant within
+    # the read sequence
+    read_pos_before = read.base0_read_position_before_variant
+    read_pos_after = read.base0_read_position_after_variant
+
+    # positions of the nucleotides before and after the variant on the
+    # reference genome
+    ref_pos_before = reference_positions[read_pos_before]
+
+    if ref_pos_before is None:
+        logging.warning(
+            "Missing reference pos for nucleotide before variant on read: %s" % (
+                read,))
+        return None
+
+    ref_pos_after = reference_positions[read_pos_after]
+
+    if ref_pos_after is None:
+        logging.warning(
+            "Missing reference pos for nucleotide after variant on read: %s" % (
+                read,))
+        return None
+
+    if len(ref) == 0:
+        if ref_pos_after - ref_pos_before != 1:
+            # an insertion consumes no reference nucleotides, so the
+            # reference positions flanking it must be adjacent; otherwise
+            # don't use this read
+            logging.debug(
+                "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
+                    ref_pos_before,
+                    ref_pos_after,
+                    read))
+            return None
+
+        # insertions require a sequence of non-aligned bases
+        # followed by the subsequent reference position
+        ref_positions_for_inserted = reference_positions[
+            read_pos_before + 1:read_pos_after]
+        if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
+            # all these inserted nucleotides should *not* align to the
+            # reference
+            logging.debug(
+                "Skipping read, inserted nucleotides shouldn't map to reference")
+            return None
+    else:
+        # substitutions and deletions
+        if ref_pos_after - ref_pos_before != len(ref) + 1:
+            # if the number of nucleotides skipped isn't the same
+            # as the number of reference nucleotides in the variant then
+            # don't use this read
+            logging.debug(
+                "Positions before (%d) and after (%d) variant should be len(ref) + 1 apart on read %s" % (
+                    ref_pos_before,
+                    ref_pos_after,
+                    read))
+            return None
+
+    prefix = sequence[:read_pos_before + 1]
+    suffix = sequence[read_pos_after:]
+    prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
+    prefix, suffix = trim_N_nucleotides(prefix, suffix)
+    return VariantRead(prefix, alt, suffix, name=read.name)
+
 def variant_reads_from_reads_at_locus(reads, ref, alt):
     """
-    Given a collection of pysam.AlignedSegment objects, generates a
-    sequence of VariantRead objects (which are split into prefix/variant/suffix
+    Given a collection of ReadAtLocus objects, returns a
+    list of VariantRead objects (which are split into prefix/variant/suffix
     nucleotides).
 
     Parameters
@@ -50,75 +170,14 @@ def variant_reads_from_reads_at_locus(reads, ref, alt):
     alt : str
         Alternate sequence of the variant (empty for deletions)
 
-    Returns a sequence of VariantRead objects.
+    Returns a list of VariantRead objects.
     """
+    variant_reads = []
     for read in reads:
-        sequence = read.sequence
-        reference_positions = read.reference_positions
-
-        # positions of the nucleotides before and after the variant within
-        # the read sequence
-        read_pos_before = read.base0_read_position_before_variant
-        read_pos_after = read.base0_read_position_after_variant
-
-        # positions of the nucleotides before and after the variant on the
-        # reference genome
-        ref_pos_before = reference_positions[read_pos_before]
-        if ref_pos_before is None:
-            logging.warn(
-                "Missing reference pos for nucleotide before variant on read: %s" % (
-                    read,))
-            continue
-
-        ref_pos_after = reference_positions[read_pos_after]
-        if ref_pos_after is None:
-            logging.warn(
-                "Missing reference pos for nucleotide after variant on read: %s" % (
-                    read,))
-            continue
-
-        if len(ref) == 0:
-            if ref_pos_after - ref_pos_before != 1:
-                # if the number of nucleotides skipped isn't the same
-                # as the number of reference nucleotides in the variant then
-                # don't use this read
-                logging.debug(
-                    "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
-                        ref_pos_before,
-                        ref_pos_after,
-                        read))
-                continue
-            # insertions require a sequence of non-aligned bases
-            # followed by the subsequence reference position
-            ref_positions_for_inserted = reference_positions[
-                read_pos_before + 1:read_pos_after]
-            if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
-                # all these inserted nucleotides should *not* align to the
-                # reference
-                logging.debug(
-                    "Skipping read, inserted nucleotides shouldn't map to reference")
-        else:
-            # substitutions and deletions
-            if ref_pos_after - ref_pos_before != len(ref) + 1:
-                # if the number of nucleotides skipped isn't the same
-                # as the number of reference nucleotides in the variant then
-                # don't use this read
-                logging.debug(
-                    "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
-                        ref_pos_before,
-                        ref_pos_after,
-                        read))
-                continue
-        prefix = sequence[:read_pos_before + 1]
-        suffix = sequence[read_pos_after:]
-
-        if isinstance(prefix, bytes):
-            prefix = prefix.decode('ascii')
-        if isinstance(suffix, bytes):
-            suffix = suffix.decode('ascii')
-
-        yield VariantRead(prefix, alt, suffix, name=read.name)
-
+        variant_read = variant_read_from_single_read_at_locus(read, ref, alt)
+        if variant_read is not None:
+            variant_reads.append(variant_read)
+    return variant_reads
 
 def gather_reads_for_single_variant(
         samfile,
diff --git a/test/test_variant_read.py b/test/test_variant_read.py
@@ -0,0 +1,30 @@
+import isovar
+import isovar.variant_read
+from isovar.variant_read import (
+    variant_read_from_single_read_at_locus,
+    VariantRead,
+)
+from isovar.read_at_locus import ReadAtLocus
+from nose.tools import eq_
+
+def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"):
+    dummy_sequence = prefix + alt + suffix  # read spans prefix|alt|suffix
+    return ReadAtLocus(
+        name=name,
+        sequence=dummy_sequence,
+        reference_positions=list(range(1, len(dummy_sequence) + 1)),
+        quality_scores=[base_quality] * len(dummy_sequence),
+        base0_read_position_before_variant=len(prefix) - 1,
+        base0_read_position_after_variant=len(prefix) + len(alt),
+    )
+
+def test_variant_read_from_single_read_at_locus_trim_N_nucleotides():
+    read_at_locus = make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA")
+    variant_read = variant_read_from_single_read_at_locus(
+        read_at_locus, ref="T", alt="A")
+    # prefix "NCCN" ends with N so nothing survives; suffix "TNNA" keeps "T"
+    expected = VariantRead(prefix="", alt="A", suffix="T", name="dummy")
+    eq_(variant_read, expected)
+
+if __name__ == "__main__":
+    test_variant_read_from_single_read_at_locus_trim_N_nucleotides()

Original file line number	Diff line number	Diff line change
`@@ -14,4 +14,4 @@`
`14`	`14`
`15`	`15`	`from __future__ import print_function, division, absolute_import`
`16`	`16`
`17`		`-__version__ = "0.0.2"`
	`17`	`+__version__ = "0.0.3"`