Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Primer3 takes a list of VCFs instead of a VariantLookup object #62

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 4 additions & 8 deletions prymer/api/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@
from prymer.api.span import BedLikeCoords
from prymer.api.span import Span
from prymer.api.span import Strand
from prymer.api.variant_lookup import FileBasedVariantLookup
from prymer.api.variant_lookup import SimpleVariant

# from prymer.api.variant_lookup import _DiskBasedLookup
# from prymer.api.variant_lookup import _InMemoryLookup
# from prymer.api.variant_lookup import _VariantLookup
from prymer.api.variant_lookup import VariantLookup
from prymer.api.variant_lookup import VariantOverlapDetector
from prymer.api.variant_lookup import VariantType
from prymer.api.variant_lookup import cached
from prymer.api.variant_lookup import disk_based

__all__ = [
"ClusteredIntervals",
Expand All @@ -30,8 +30,4 @@
"VariantType",
"SimpleVariant",
"VariantLookup",
"FileBasedVariantLookup",
"VariantOverlapDetector",
"cached",
"disk_based",
]
180 changes: 105 additions & 75 deletions prymer/api/variant_lookup.py
Original file line number Diff line number Diff line change
@@ -1,34 +1,25 @@
"""
# Variant Lookup Class and Methods

This module contains the abstract class
[`VariantLookup`][prymer.api.variant_lookup.VariantLookup] to facilitate retrieval of
variants that overlap a given genomic coordinate range. Concrete implementations must implement the
[`query()`][prymer.api.variant_lookup.VariantLookup.query] method for retrieving variants
This module contains the class [`VariantLookup`][prymer.api.variant_lookup.VariantLookup] to
facilitate retrieval of variants that overlap a given genomic coordinate range.
The [`query()`][`prymer.api.variant_lookup.VariantLookup.query`] method is used to retrieve variants
that overlap the given range.

Two concrete implementations are provided that both take a list of VCF files to be queried:

- [`FileBasedVariantLookup`][prymer.api.variant_lookup.FileBasedVariantLookup] -- performs
disk-based retrieval of variants (using a VCF index). This class is recommended for large VCFs. The
[`disk_based()`][prymer.api.variant_lookup.disk_based] alternative constructor is
provided for easy construction of this object.
- [`VariantOverlapDetector`][prymer.api.variant_lookup.VariantOverlapDetector] -- reads in
variants into memory and uses an
[`VariantLookup`][prymer.api.variant_lookup.VariantLookup] needs a list of VCF files to be queried,
a minimum Minimum Allele Frequency with which to optionally filter variants, a boolean `include_missing_mafs` flag, and a
boolean `cached` flag. It is recommended to set `cached` to `False` for large VCFs. When `cached`
is True, variants from smaller VCFs are loaded into memory.
[`pybedlite.overlap_detector.OverlapDetector`](https://pybedlite.readthedocs.io/en/latest/api.html#pybedlite.overlap_detector.OverlapDetector)
for querying. This class is recommended for small VCFs. The
[`cached()`][prymer.api.variant_lookup.cached] alternative constructor is provided for
easy construction of this object.

Each class can also use minor allele frequency (MAF) to filter variants.
is used for querying.

The helper class `SimpleVariant` is included to facilitate VCF querying and reporting out results.

## Examples

```python
>>> from pathlib import Path
>>> lookup = cached(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup = VariantLookup(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
>>> variants = lookup.query(refname="chr2", start=7999, end=9900)
Expand Down Expand Up @@ -67,7 +58,6 @@
from types import TracebackType
from typing import ContextManager
from typing import Optional
from typing import final

import pysam
from fgpyo.vcf import reader
Expand Down Expand Up @@ -237,7 +227,7 @@
)


class VariantLookup(ABC):
class _VariantLookup(ABC):
"""Base class to represent a variant from a given genomic range.

Attributes:
Expand All @@ -258,7 +248,11 @@
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs

@final
@abstractmethod
def close(self) -> None:
pass

Check warning on line 253 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L253

Added line #L253 was not covered by tests

@abstractmethod
def query(
self,
refname: str,
Expand All @@ -277,20 +271,7 @@
include_missing_mafs: whether to include variants with a missing MAF
(overrides self.include_missing_mafs)
"""
if maf is None:
maf = self.min_maf
if include_missing_mafs is None:
include_missing_mafs = self.include_missing_mafs

variants = self._query(refname=refname, start=start, end=end)
if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]
pass

Check warning on line 274 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L274

Added line #L274 was not covered by tests

@staticmethod
def to_variants(
Expand Down Expand Up @@ -319,20 +300,83 @@
simple_vars.extend(simple_variants)
return sorted(simple_vars, key=lambda v: (v.pos, v.id))

@abstractmethod
def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
"""Subclasses must implement this method."""

class VariantLookup(ContextManager):
def __init__(
self,
vcf_paths: list[Path],
min_maf: Optional[float],
include_missing_mafs: bool,
cached: bool = True,
):
self.vcf_paths: list[Path] = vcf_paths
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs
self.cached: bool = cached

self._lookup: _VariantLookup

if cached:
self._lookup = _InMemoryLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
else:
self._lookup = _DiskBasedLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)

Comment on lines +304 to +327
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add input validation for vcf_paths.

Validate vcf_paths before creating lookup implementation.

 def __init__(
     self,
     vcf_paths: list[Path],
     min_maf: Optional[float],
     include_missing_mafs: bool,
     cached: bool = True,
 ):
+    if not vcf_paths:
+        raise ValueError("No VCF paths provided")
+    if any(not path.exists() for path in vcf_paths):
+        raise ValueError("One or more VCF paths do not exist")
     self.vcf_paths: list[Path] = vcf_paths
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
class VariantLookup(ContextManager):
def __init__(
self,
vcf_paths: list[Path],
min_maf: Optional[float],
include_missing_mafs: bool,
cached: bool = True,
):
self.vcf_paths: list[Path] = vcf_paths
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs
self.cached: bool = cached
self._lookup: _VariantLookup
if cached:
self._lookup = _InMemoryLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
else:
self._lookup = _DiskBasedLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
class VariantLookup(ContextManager):
def __init__(
self,
vcf_paths: list[Path],
min_maf: Optional[float],
include_missing_mafs: bool,
cached: bool = True,
):
if not vcf_paths:
raise ValueError("No VCF paths provided")
if any(not path.exists() for path in vcf_paths):
raise ValueError("One or more VCF paths do not exist")
self.vcf_paths: list[Path] = vcf_paths
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs
self.cached: bool = cached
self._lookup: _VariantLookup
if cached:
self._lookup = _InMemoryLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
else:
self._lookup = _DiskBasedLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)

def __enter__(self) -> "VariantLookup":
return self

def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional[TracebackType],
) -> None:
self._lookup.close()

def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
maf = maf if maf is not None else self.min_maf
include_missing_mafs = (
include_missing_mafs if include_missing_mafs is not None else self.include_missing_mafs
)
variants: list[SimpleVariant] = self._lookup.query(
refname=refname,
start=start,
end=end,
)

if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]

def close(self) -> None:
"""Close the underlying VCF file handles."""
self._lookup.close()

Check warning on line 368 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L368

Added line #L368 was not covered by tests


class FileBasedVariantLookup(ContextManager, VariantLookup):
class _DiskBasedLookup(_VariantLookup):
"""Implementation of `VariantLookup` that queries against indexed VCF files each time a query is
performed. Assumes the index is located adjacent to the VCF file and has the same base name with
either a .csi or .tbi suffix.

Example:

```python
>>> with FileBasedVariantLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
>>> with _DiskBasedLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]

Expand All @@ -355,7 +399,7 @@
open_fh = pysam.VariantFile(str(path))
self._readers.append(open_fh)

def __enter__(self) -> "FileBasedVariantLookup":
def __enter__(self) -> "_DiskBasedLookup":
"""Enter the context manager."""
return self

Expand All @@ -369,7 +413,14 @@
self.close()
return None

def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
"""Queries variants from the VCFs used by this lookup and returns a `SimpleVariant`."""
simple_variants: list[SimpleVariant] = []
for fh, path in zip(self._readers, self.vcf_paths, strict=True):
Expand All @@ -387,7 +438,7 @@
handle.close()


class VariantOverlapDetector(VariantLookup):
class _InMemoryLookup(_VariantLookup):
"""Implements `VariantLookup` by reading the entire VCF into memory and loading the resulting
Variants into an `OverlapDetector`."""

Expand Down Expand Up @@ -415,7 +466,14 @@
)
self._overlap_detector.add_all(variant_intervals)

def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
Comment on lines +469 to +476
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Add chromosome validation in _InMemoryLookup.query.

Match chromosome validation behavior with _DiskBasedLookup.

 def query(
     self,
     refname: str,
     start: int,
     end: int,
     maf: Optional[float] = None,
     include_missing_mafs: bool = None,
 ) -> list[SimpleVariant]:
+    if not any(refname in self._overlap_detector.intervals_by_refname):
+        _logger.debug(f"No variants found for chromosome {refname}.")
+        return []
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
if not any(refname in self._overlap_detector.intervals_by_refname):
_logger.debug(f"No variants found for chromosome {refname}.")
return []

"""Queries variants from the VCFs used by this lookup."""
query = Interval(
refname=refname, start=start - 1, end=end
Expand All @@ -426,6 +484,9 @@
)
return sorted(overlapping_variants, key=lambda v: (v.pos, v.id))

def close(self) -> None:
pass

Check warning on line 488 in prymer/api/variant_lookup.py

View check run for this annotation

Codecov / codecov/patch

prymer/api/variant_lookup.py#L488

Added line #L488 was not covered by tests


# module-level functions
def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]:
Expand Down Expand Up @@ -460,34 +521,3 @@
maf = num_alt / len(gts)

return maf


def cached(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> VariantOverlapDetector:
"""Constructs a `VariantLookup` that caches all variants in memory for fast lookup.
Appropriate for small VCFs."""
return VariantOverlapDetector(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)


def disk_based(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> FileBasedVariantLookup:
"""Constructs a `VariantLookup` that queries indexed VCFs on disk for each lookup.

Appropriate for large VCFs.

Example:

```python
>>> with disk_based([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]

```
""" # noqa: E501
return FileBasedVariantLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
37 changes: 28 additions & 9 deletions prymer/primer3/primer3.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@
nucleotide sequences can be retrieved. The full path to the `primer3` executable can provided,
otherwise it is assumed to be on the PATH. Furthermore, optionally a
[`VariantLookup`][prymer.api.variant_lookup.VariantLookup] may be provided to
hard-mask the design and target regions as to avoid design primers over polymorphic sites.
hard-mask the design and target regions so as to avoid design primers over polymorphic sites.

```python
>>> from pathlib import Path
>>> from prymer.api.variant_lookup import VariantLookup, VariantOverlapDetector
>>> from prymer.api.variant_lookup import _VariantLookup, _InMemoryLookup
>>> genome_fasta = Path("./tests/primer3/data/miniref.fa")
>>> genome_vcf = Path("./tests/primer3/data/miniref.variants.vcf.gz")
>>> variant_lookup: VariantLookup = VariantOverlapDetector(vcf_paths=[genome_vcf], min_maf=0.01, include_missing_mafs=False)
>>> designer = Primer3(genome_fasta=genome_fasta, variant_lookup=variant_lookup)
>>> variant_lookup: VariantLookup = VariantLookup(vcf_paths=[genome_vcf], min_maf=0.01, include_missing_mafs=False)
>>> designer = Primer3(genome_fasta=genome_fasta, variants=variant_lookup)

```

Expand Down Expand Up @@ -229,13 +229,14 @@
self,
genome_fasta: Path,
executable: Optional[str] = None,
variant_lookup: Optional[VariantLookup] = None,
variants: list[Path] | VariantLookup | None = None,
) -> None:
"""
Args:
genome_fasta: Path to reference genome .fasta file
executable: string representation of the path to primer3_core
variant_lookup: VariantLookup object to facilitate hard-masking variants
variants: an optional list of VCF `Path`s or VariantLookup object to facilitate
hard-masking variants

Assumes the sequence dictionary is located adjacent to the .fasta file and has the same
base name with a .dict suffix.
Expand All @@ -246,12 +247,30 @@
)
command: list[str] = [f"{executable_path}"]

self.variant_lookup = variant_lookup
self.variant_lookup: Optional[VariantLookup]
# If no variants are given, or they are in a pre-constructed `VariantLookup` object,
# set the object attribute
if variants is None or isinstance(variants, VariantLookup):
self.variant_lookup = variants
# If `variants` is a list of `Path`s to indexed VCF files, create a `VariantLookup` object
# with default settings (no MAF filtering, do not include variants with missing MAFs)
elif isinstance(variants, list) and all(
isinstance(path, Path) for path in variants
): # if provided path, create appropriate `VariantLookup`
self.variant_lookup = VariantLookup(

Check warning on line 260 in prymer/primer3/primer3.py

View check run for this annotation

Codecov / codecov/patch

prymer/primer3/primer3.py#L260

Added line #L260 was not covered by tests
vcf_paths=variants,
cached=True,
min_maf=0.00,
include_missing_mafs=False,
)
else:
raise ValueError(
"Variant lookup is required to be a list of `Path` objects or a"
f" pre-constructed `VariantLookup` object: received {variants}"
)
self._fasta = pysam.FastaFile(filename=f"{genome_fasta}")

dict_path = genome_fasta.with_suffix(".dict")
# TODO: This is a placeholder while waiting for #160 to be resolved
# https://github.com/fulcrumgenomics/fgpyo/pull/160
with reader(dict_path, file_type=sam.SamFileType.SAM) as fh:
self._dict: SequenceDictionary = SequenceDictionary.from_sam(header=fh.header)

Expand Down
Loading
Loading