Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an N50 and N90 statistic #216

Merged
merged 1 commit into from
Dec 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ version 0.13.0-dev
------------------
+ Python 3.13 support was added.
+ Python 3.8 and 3.9 are no longer supported.
+ Add an N50 and N90 statistic to the sequence length distribution report.
+ Allow proper judging of aligned BAM files as input data by ignoring any
secondary or supplementary alignment records. This is equivalent to running
``samtools fastq > input.fastq`` on the input data before submitting it to
Expand Down
30 changes: 29 additions & 1 deletion src/sequali/report_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -486,6 +486,8 @@ class SequenceLengthDistribution(ReportModule):
q90: int
q95: int
q99: int
n50: int
n90: int
read_pair_info: Optional[str] = None

def plot(self) -> pygal.Graph:
Expand All @@ -505,6 +507,13 @@ def distribution_table(self):
self.q75, self.q90, self.q95, self.q99}) == 1:
return ""
return f"""
<p class="explanation">
See this
<a
href="https://en.wikipedia.org/wiki/N50,_L50,_and_related_statistics"
>wikipedia lemma</a>
for an explanation about contiguity statistics.
</p>
<table>
<tr><th>Percentile</th><th>Read length</th></tr>
<tr><td>1</td><td style="text-align:right;">{self.q1:,}</td></tr>
Expand All @@ -517,6 +526,11 @@ def distribution_table(self):
<tr><td>95</td><td style="text-align:right;">{self.q95:,}</td></tr>
<tr><td>99</td><td style="text-align:right;">{self.q99:,}</td></tr>
</table>
<table>
<tr><th>Contiguity</th><th>Read length</th></tr>
<tr><td>N90</td><td style="text-align:right;">{self.n90:,}</td></tr>
<tr><td>N50</td><td style="text-align:right;">{self.n50:,}</td></tr>
</table>
"""

def to_html(self):
Expand Down Expand Up @@ -574,8 +588,22 @@ def from_base_count_tables(cls,
if done:
break

total_bases = sum(base_count_tables)
half_bases = total_bases // 2
ten_percent_bases = int(total_bases * 0.1)
sum_bases = 0
N50 = None
N90 = None
for length, number in enumerate(sequence_lengths):
sum_bases += length * number
if N90 is None and sum_bases >= ten_percent_bases:
N90 = length
if N50 is None and sum_bases >= half_bases:
N50 = length
break
return cls(["0"] + x_labels, [sequence_lengths[0]] + lengths,
*percentile_lengths, read_pair_info=read_pair_info) # type: ignore
*percentile_lengths, n50=N50, n90=N90, # type: ignore
read_pair_info=read_pair_info) # type: ignore


@dataclasses.dataclass
Expand Down
4 changes: 4 additions & 0 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def test_simple_fastq(tmp_path):
assert result["summary"]["minimum_length"] == 7
assert result["summary"]["total_gc_bases"] == 4
assert result["summary"]["total_bases"] == 22
assert result["sequence_length_distribution"]["n50"] == 7
assert result["sequence_length_distribution"]["n90"] == 7


def test_empty_file(tmp_path):
Expand Down Expand Up @@ -130,6 +132,8 @@ def test_nanopore_reads(tmp_path):
fastq_json = tmp_path / "100_nanopore_reads.fastq.gz.json"
result = json.loads(fastq_json.read_text())
assert result["summary"]["total_reads"] == 100
assert result["sequence_length_distribution"]["n50"] == 59502
assert result["sequence_length_distribution"]["n90"] == 7517


def test_dorado_nanopore_bam(tmp_path):
Expand Down
Loading