Merge pull request #216 from rhpvorderman/issue197

Add a N50 and N90 statistic
rhpvorderman · Dec 4, 2024 · 919784e · 919784e
2 parents 1967f3f + 5a4b7cc
commit 919784e
Show file tree

Hide file tree

Showing 3 changed files with 34 additions and 1 deletion.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -11,6 +11,7 @@ version 0.13.0-dev
 ------------------
 + Python 3.13 support was added.
 + Python 3.8 and 3.9 are no longer supported.
++ Add a N50 and N90 statistic to the sequence length distribution report.
 + Allow proper judging of aligned BAM files as input data by ignoring any
   secondary or supplementary alignment records. This is equivalent to running
   ``samtools fastq > input.fastq`` on the input data before submitting it to

diff --git a/src/sequali/report_modules.py b/src/sequali/report_modules.py
@@ -486,6 +486,8 @@ class SequenceLengthDistribution(ReportModule):
     q90: int
     q95: int
     q99: int
+    n50: int
+    n90: int
     read_pair_info: Optional[str] = None
 
     def plot(self) -> pygal.Graph:
@@ -505,6 +507,13 @@ def distribution_table(self):
                 self.q75, self.q90, self.q95, self.q99}) == 1:
             return ""
         return f"""
+            <p class="explanation">
+            See this
+            <a
+            href="https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics"
+            >wikipedia lemma</a>
+                for an explanation about contiguity statistics.
+            </p>
             <table>
                 <tr><th>Percentile</th><th>Read length</th></tr>
                 <tr><td>1</td><td style="text-align:right;">{self.q1:,}</td></tr>
@@ -517,6 +526,11 @@ def distribution_table(self):
                 <tr><td>95</td><td style="text-align:right;">{self.q95:,}</td></tr>
                 <tr><td>99</td><td style="text-align:right;">{self.q99:,}</td></tr>
             </table>
+            <table>
+                <tr><th>Contiguity</th><th>Read length</th></tr>
+                <tr><td>N90</td><td style="text-align:right;">{self.n90:,}</td></tr>
+                <tr><td>N50</td><td style="text-align:right;">{self.n50:,}</td></tr>
+            </table>
         """
 
     def to_html(self):
@@ -574,8 +588,22 @@ def from_base_count_tables(cls,
             if done:
                 break
 
+        total_bases = sum(base_count_tables)
+        half_bases = total_bases // 2
+        ten_percent_bases = int(total_bases * 0.1)
+        sum_bases = 0
+        N50 = None
+        N90 = None
+        for length, number in enumerate(sequence_lengths):
+            sum_bases += length * number
+            if N90 is None and sum_bases >= ten_percent_bases:
+                N90 = length
+            if N50 is None and sum_bases >= half_bases:
+                N50 = length
+                break
         return cls(["0"] + x_labels, [sequence_lengths[0]] + lengths,
-                   *percentile_lengths, read_pair_info=read_pair_info)  # type: ignore
+                   *percentile_lengths, n50=N50, n90=N90,  # type: ignore
+                   read_pair_info=read_pair_info)  # type: ignore
 
 
 @dataclasses.dataclass

diff --git a/tests/test_integration.py b/tests/test_integration.py
@@ -36,6 +36,8 @@ def test_simple_fastq(tmp_path):
     assert result["summary"]["minimum_length"] == 7
     assert result["summary"]["total_gc_bases"] == 4
     assert result["summary"]["total_bases"] == 22
+    assert result["sequence_length_distribution"]["n50"] == 7
+    assert result["sequence_length_distribution"]["n90"] == 7
 
 
 def test_empty_file(tmp_path):
@@ -130,6 +132,8 @@ def test_nanopore_reads(tmp_path):
     fastq_json = tmp_path / "100_nanopore_reads.fastq.gz.json"
     result = json.loads(fastq_json.read_text())
     assert result["summary"]["total_reads"] == 100
+    assert result["sequence_length_distribution"]["n50"] == 59502
+    assert result["sequence_length_distribution"]["n90"] == 7517
 
 
 def test_dorado_nanopore_bam(tmp_path):