Skip to content

Commit

Permalink
Merge pull request #216 from rhpvorderman/issue197
Browse files Browse the repository at this point in the history
Add a N50 and N90 statistic
  • Loading branch information
rhpvorderman authored Dec 4, 2024
2 parents 1967f3f + 5a4b7cc commit 919784e
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ version 0.13.0-dev
------------------
+ Python 3.13 support was added.
+ Python 3.8 and 3.9 are no longer supported.
+ Add a N50 and N90 statistic to the sequence length distribution report.
+ Allow proper judging of aligned BAM files as input data by ignoring any
secondary or supplementary alignment records. This is equivalent to running
``samtools fastq > input.fastq`` on the input data before submitting it to
Expand Down
30 changes: 29 additions & 1 deletion src/sequali/report_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,8 @@ class SequenceLengthDistribution(ReportModule):
q90: int
q95: int
q99: int
n50: int
n90: int
read_pair_info: Optional[str] = None

def plot(self) -> pygal.Graph:
Expand All @@ -505,6 +507,13 @@ def distribution_table(self):
self.q75, self.q90, self.q95, self.q99}) == 1:
return ""
return f"""
<p class="explanation">
See this
<a
href="https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics"
>wikipedia lemma</a>
for an explanation about contiguity statistics.
</p>
<table>
<tr><th>Percentile</th><th>Read length</th></tr>
<tr><td>1</td><td style="text-align:right;">{self.q1:,}</td></tr>
Expand All @@ -517,6 +526,11 @@ def distribution_table(self):
<tr><td>95</td><td style="text-align:right;">{self.q95:,}</td></tr>
<tr><td>99</td><td style="text-align:right;">{self.q99:,}</td></tr>
</table>
<table>
<tr><th>Contiguity</th><th>Read length</th></tr>
<tr><td>N90</td><td style="text-align:right;">{self.n90:,}</td></tr>
<tr><td>N50</td><td style="text-align:right;">{self.n50:,}</td></tr>
</table>
"""

def to_html(self):
Expand Down Expand Up @@ -574,8 +588,22 @@ def from_base_count_tables(cls,
if done:
break

total_bases = sum(base_count_tables)
half_bases = total_bases // 2
ten_percent_bases = int(total_bases * 0.1)
sum_bases = 0
N50 = None
N90 = None
for length, number in enumerate(sequence_lengths):
sum_bases += length * number
if N90 is None and sum_bases >= ten_percent_bases:
N90 = length
if N50 is None and sum_bases >= half_bases:
N50 = length
break
return cls(["0"] + x_labels, [sequence_lengths[0]] + lengths,
*percentile_lengths, read_pair_info=read_pair_info) # type: ignore
*percentile_lengths, n50=N50, n90=N90, # type: ignore
read_pair_info=read_pair_info) # type: ignore


@dataclasses.dataclass
Expand Down
4 changes: 4 additions & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_simple_fastq(tmp_path):
assert result["summary"]["minimum_length"] == 7
assert result["summary"]["total_gc_bases"] == 4
assert result["summary"]["total_bases"] == 22
assert result["sequence_length_distribution"]["n50"] == 7
assert result["sequence_length_distribution"]["n90"] == 7


def test_empty_file(tmp_path):
Expand Down Expand Up @@ -130,6 +132,8 @@ def test_nanopore_reads(tmp_path):
fastq_json = tmp_path / "100_nanopore_reads.fastq.gz.json"
result = json.loads(fastq_json.read_text())
assert result["summary"]["total_reads"] == 100
assert result["sequence_length_distribution"]["n50"] == 59502
assert result["sequence_length_distribution"]["n90"] == 7517


def test_dorado_nanopore_bam(tmp_path):
Expand Down

0 comments on commit 919784e

Please sign in to comment.