Commit

Merge pull request #10 from ChrisgKent/dev
Dev
ChrisgKent authored Oct 31, 2024
2 parents 81bbbe8 + 6b10a36 commit 0e8096a
Showing 17 changed files with 591 additions and 689 deletions.
287 changes: 144 additions & 143 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion primalscheme3/__init__.py
@@ -1 +1 @@
__version__ = "1.3.1"
__version__ = "1.4.0"
53 changes: 50 additions & 3 deletions primalscheme3/cli.py
@@ -10,9 +10,11 @@
from primalscheme3.__init__ import __version__
from primalscheme3.core.config import Config, MappingType
from primalscheme3.core.msa import parse_msa
from primalscheme3.core.primer_visual import primer_mismatch_heatmap
from primalscheme3.core.primer_visual import bedfile_plot_html, primer_mismatch_heatmap
from primalscheme3.core.progress_tracker import ProgressManager
from primalscheme3.interaction.interaction import visualise_interactions
from primalscheme3.interaction.interaction import (
visualise_interactions,
)
from primalscheme3.panel.panel_main import PanelRunModes, panelcreate
from primalscheme3.remap.remap import remap
from primalscheme3.repair.repair import repair
@@ -398,7 +400,6 @@ def repair_mode(
pm = ProgressManager()

repair(
cores=1,
config_path=config,
bedfile_path=bedfile,
force=force,
@@ -501,5 +502,51 @@ def visualise_primer_mismatches(
)


@app.command(no_args_is_help=True)
def visualise_bedfile(
bedfile: Annotated[
pathlib.Path,
typer.Argument(
help="The bedfile containing the primers",
readable=True,
exists=True,
dir_okay=False,
resolve_path=True,
),
],
ref_id: Annotated[str, typer.Option(help="The reference genome ID")],
ref_path: Annotated[
pathlib.Path,
typer.Argument(
help="The bedfile containing the primers",
readable=True,
exists=True,
dir_okay=False,
resolve_path=True,
),
],
output: Annotated[
pathlib.Path,
typer.Option(help="Output location of the plot", dir_okay=False, writable=True),
] = pathlib.Path("bedfile.html"),
):
"""
Visualise the bedfile
"""
from Bio import SeqIO

ref_file = SeqIO.index(ref_path, "fasta")
ref_genome = ref_file.get(ref_id)
if ref_genome is None:
raise typer.BadParameter(
f"Reference genome ID '{ref_id}' not found in '{ref_path}'"
)

with open(output, "w") as outfile:
outfile.write(
bedfile_plot_html(bedfile=bedfile, ref_name=ref_id, ref_seq=ref_genome)
)


if __name__ == "__main__":
app()
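
For context, a minimal sketch of how the new `visualise_bedfile` subcommand could be exercised programmatically with Typer's test runner. The file paths and record ID are hypothetical, and the command name assumes Typer's default underscore-to-dash conversion.

```python
from typer.testing import CliRunner

from primalscheme3.cli import app

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "visualise-bedfile",
        "primer.bed",        # hypothetical bedfile path (first positional argument)
        "reference.fasta",   # hypothetical reference FASTA (second positional argument)
        "--ref-id", "MN908947.3",  # hypothetical record ID present in the FASTA
        "--output", "bedfile.html",
    ],
)
print(result.exit_code, result.output)
```
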
24 changes: 18 additions & 6 deletions primalscheme3/core/bedfiles.py
@@ -105,7 +105,7 @@ def end(self) -> int:
return self._end

def __str__(self, *kwargs) -> str:
# I use *kwargs so that it can have the same behavor as PrimerPairs
# I use *kwargs so that it can have the same behaviour as PrimerPairs
return f"{self.chrom_name}\t{self.start}\t{self.end}\t{self.primername}\t{self.pool + 1}\t{self.direction}\t{self.sequence}"


@@ -147,7 +147,7 @@ def read_in_bedprimerpairs(path: pathlib.Path) -> tuple[list[BedPrimerPair], lis
primerpairs = []
primerlines, _headers = read_in_bedlines(path) # Ignore headers for now

# Group primers by referance
# Group primers by reference
ref_to_bedlines: dict[str, list[BedLine]] = dict()
for ref in {bedline.chrom_name for bedline in primerlines}:
ref_to_bedlines[ref] = [x for x in primerlines if x.chrom_name == ref]
@@ -162,14 +162,26 @@
x for x in ref_bed_lines if x.amplicon_number == ampliconnumber
]
pool = ampliconlines[0].pool

fp = [x for x in ampliconlines if x.direction == "+"]
rp = [x for x in ampliconlines if x.direction == "-"]

if len(fp) == 0:
raise ValueError(
f"Primer {ampliconlines[0].primername} has no forward primer"
)
if len(rp) == 0:
raise ValueError(
f"Primer {ampliconlines[0].primername} has no reverse primer"
)
# Group the ampliconlines by direction
fkmer = FKmer(
[x.end for x in ampliconlines if x.direction == "+"][0],
[x.sequence for x in ampliconlines if x.direction == "+"],
max([x.end for x in fp]),
[x.sequence for x in fp],
)
rkmer = RKmer(
[x.start for x in ampliconlines if x.direction == "-"][0],
[x.sequence for x in ampliconlines if x.direction == "-"],
min([x.start for x in rp]),
[x.sequence for x in rp],
)
primerpairs.append(
BedPrimerPair(
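The grouping logic added above can be illustrated with a standalone sketch that uses plain tuples in place of the project's BedLine objects; the data and the field order (start, end, direction, sequence) are hypothetical.

```python
# Hypothetical primer lines for one amplicon: (start, end, direction, sequence)
ampliconlines = [
    (100, 120, "+", "ACGTACGTACGTACGTACGT"),
    (102, 122, "+", "GTACGTACGTACGTACGTAC"),  # alternative forward primer
    (380, 400, "-", "TTGCATTGCATTGCATTGCA"),
]

fp = [x for x in ampliconlines if x[2] == "+"]
rp = [x for x in ampliconlines if x[2] == "-"]
if not fp:
    raise ValueError("Amplicon has no forward primer")
if not rp:
    raise ValueError("Amplicon has no reverse primer")

# Anchor the forward kmer at the right-most forward end and the reverse kmer
# at the left-most reverse start, so the chosen coordinate covers every
# sequence in the respective set.
fkmer_end = max(x[1] for x in fp)    # 122
rkmer_start = min(x[0] for x in rp)  # 380
```
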
16 changes: 12 additions & 4 deletions primalscheme3/core/config.py
@@ -5,7 +5,7 @@
from primalscheme3 import __version__


# Writen by Andy Smith, modified by: Chris Kent
# Written by Andy Smith, modified by: Chris Kent
class MappingType(Enum):
"""
Enum for the mapping type
@@ -94,9 +94,17 @@ def items(self) -> dict[str, Any]:
Return a dict (key, val) for non-private, non-callable members
"""
items = {}
for key, val in self.__dict__.items():
if not (callable(getattr(self, key)) or key.startswith("_")):
items[key] = val
for key in [x for x in dir(self) if not x.startswith("_")]:
if not callable(getattr(self, key)): # prevent functions
value = getattr(self, key)
# Convert Enum and Path objects to their values
if isinstance(value, Enum):
items[key] = value.value
if isinstance(value, pathlib.Path):
items[key] = str(value)
else:
items[key] = value

return items

def to_json(self) -> dict[str, Any]:
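The serialisation pattern above — exposing non-private, non-callable attributes while converting Enum members and pathlib.Path values to JSON-friendly primitives — can be sketched against a toy class; all names below are hypothetical, not the project's Config.

```python
import pathlib
from enum import Enum
from typing import Any


class Mode(Enum):
    FIRST = "first"
    CONSENSUS = "consensus"


class ToyConfig:
    mapping = Mode.FIRST
    output = pathlib.Path("./output")
    n_pools = 2

    def items(self) -> dict[str, Any]:
        items: dict[str, Any] = {}
        for key in (k for k in dir(self) if not k.startswith("_")):
            value = getattr(self, key)
            if callable(value):  # skip methods
                continue
            if isinstance(value, Enum):
                items[key] = value.value
            elif isinstance(value, pathlib.Path):
                items[key] = str(value)
            else:
                items[key] = value
        return items


print(ToyConfig().items())
# {'mapping': 'first', 'n_pools': 2, 'output': 'output'}
```
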
129 changes: 8 additions & 121 deletions primalscheme3/core/digestion.py
@@ -1,12 +1,9 @@
# Modules
import itertools
from collections import Counter
from enum import Enum
from typing import Callable, Union

import networkx as nx
import numpy as np
from loguru._logger import Logger

# Submodules
from primaldimer_py import (
@@ -104,7 +101,7 @@ def parse_thermo_error(result: THERMORESULT) -> DIGESTION_ERROR:
case THERMORESULT.HAIRPIN:
return DIGESTION_ERROR.THERMO_HAIRPIN
case _:
raise ValueError("Unknown error occured")
raise ValueError("Unknown error occurred")


def parse_error_list(
@@ -604,14 +601,6 @@ def f_digest_index(
else:
raise ValueError("Unknown error occurred")

# # DownSample the seqs if asked
# if cfg["reducekmers"]:
# wanted_seqs = reduce_kmers(
# seqs={*parsed_seqs.keys()},
# max_edit_dist=cfg["editdist_max"],
# end_3p=cfg["editdist_end3p"],
# )

# Thermo check the kmers
thermo_result = thermo_check_kmers({*parsed_seqs.keys()}, config)
match thermo_result:
@@ -638,108 +627,6 @@ def hamming_dist(s1, s2) -> int:
return sum((x != y for x, y in zip(s1[::-1], s2[::-1])))
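
A quick illustration of the distance above: mismatches are counted from the 3' (right-hand) end, and zip() truncates the comparison to the shorter sequence (hypothetical inputs).

```python
assert hamming_dist("ACGTT", "ACGAT") == 1     # single mismatch near the 3' end
assert hamming_dist("GGGACGTT", "ACGTT") == 0  # extra 5' bases are ignored
```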


def reduce_kmers(seqs: set[str], max_edit_dist: int = 1, end_3p: int = 6) -> set[str]:
"""
Reduces a set of DNA sequences by clustering them based on their 3' end, and then minimizing the edit distance between
all tails within the same 3' cluster. The resulting set of sequences will have at most `max_edit_dist` differences
between any two sequences, and will all have a common 3' end of length `end_3p`.
Args:
seqs: A set of DNA sequences to be reduced.
max_edit_dist: The maximum edit distance allowed between any two sequences in the same 3' cluster. Defaults to 1.
end_3p: The length of the 3' end to use for clustering. Defaults to 6.
Returns:
A set of reduced DNA sequences, where each sequence has a common 3' end of length `end_3p`, and at most
`max_edit_dist` differences between any two sequences.
"""
## Cluster sequences via the 3p end
p3_end_dict: dict[str, set[str]] = {}
for sequence in seqs:
p3_end = sequence[-end_3p:]
p5_tail = sequence[:-end_3p]
if p3_end in p3_end_dict:
p3_end_dict[p3_end].add(p5_tail)
else:
p3_end_dict[p3_end] = {p5_tail}

## Minimise edit distance between all tails within the same p3 cluster
for p3_end, p5_tails in p3_end_dict.items():
# If only one sequence skip
if len(p5_tails) <= 1:
continue

# Create a linkage graph
G = nx.Graph()
G.add_nodes_from(p5_tails)
for s1, s2 in itertools.combinations(p5_tails, 2):
if hamming_dist(s1, s2) <= max_edit_dist:
# Add edges if the distance is <= hamming dist max
G.add_edge(s1, s2)

# Find the most connected sequence
sorted_sequences = sorted(
p5_tails, key=lambda seq: (len(list(G.neighbors(seq))), seq), reverse=True
)

# Seqs which are included in the scheme
included_seqs = set()
# Seqs which have a closely related sequence included
accounted_seqs = set()

for sequence in sorted_sequences:
# If the sequence is not accounted for and not included
if sequence not in accounted_seqs and sequence not in included_seqs:
included_seqs.add(sequence)
# Add all the neighbors into accounted seqs
for neighbors in G.neighbors(sequence):
accounted_seqs.add(neighbors)

# Update the p3_end_dict to contain the downsampled tails
p3_end_dict[p3_end] = included_seqs

seqs = set()
## Regenerate all the sequences from p3_end_dict
for k, v in p3_end_dict.items():
for seq in v:
seqs.add(f"{seq}{k}")
return seqs
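
A toy usage sketch of the reduction described in the docstring above, assuming the function and its networkx/itertools imports are available; the sequences are hypothetical.

```python
# Three primers share the same six-base 3' end; two of the 5' tails differ
# by a single base, so they cluster together and one representative is kept.
seqs = {
    "ACGTACGTAA" + "TTTGCA",
    "ACGTACGTAT" + "TTTGCA",
    "GGGGGGGGGG" + "TTTGCA",
}
reduced = reduce_kmers(seqs, max_edit_dist=1, end_3p=6)
print(len(reduced))  # 2: one representative of the clustered pair, plus the outlier
```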


def concurrent_digest(
msa_array: np.ndarray,
config: Config,
findexes: list[int],
rindexes: list[int],
) -> tuple[list[FKmer], list[RKmer]]:
"""
Carries out the FKmer and RKmer digestion in parallel.
"""
import multiprocessing as mp

q = mp.Queue()
jobs = (f_digest, r_digest)
args = (
(
msa_array.copy(),
config,
findexes,
None,
),
(
msa_array.copy(),
config,
rindexes,
None,
),
)
for job, arg in zip(jobs, args):
p = mp.Process(target=job, args=(arg, q))
p.start()

return q.get(), q.get()


def f_digest(
msa_array: np.ndarray, config: Config, findexes: list[int], logger
) -> list[FKmer]:
@@ -786,7 +673,7 @@ def digest(
config: Config,
progress_manager: ProgressManager,
indexes: tuple[list[int], list[int]] | None = None,
logger: None | Logger = None,
logger=None,
chrom: str = "",
) -> tuple[list[FKmer], list[RKmer]]:
"""
@@ -795,12 +682,12 @@
:param msa_array: The input MSA array.
:param cfg: A dictionary containing configuration parameters.
:param indexes: A tuple of MSA indexes for (FKmers, RKmers), or None to use all indexes.
:param logger: None or the Logguru logger object.
:param logger: None or the logger object.
:return: A tuple containing lists of sorted FKmers and RKmers.
"""
# Guard for invalid indexes
if indexes is not None:
if min(indexes[0]) < 0 or max(indexes[1]) >= msa_array.shape[1]:
if min(indexes[0]) < 0 or max(indexes[0]) >= msa_array.shape[1]:
raise IndexError("FIndexes are out of range")
if min(indexes[1]) < 0 or max(indexes[1]) >= msa_array.shape[1]:
raise IndexError("RIndexes are out of range")
@@ -827,9 +714,9 @@

if logger is not None:
if isinstance(fkmer, tuple):
logger.debug(f"{chrom}:FKmer: [red]{fkmer[0]}\t{fkmer[1].value}[/red]")
logger.debug(f"{chrom}:FKmer: {fkmer[0]}\t{fkmer[1].value}")
else:
logger.debug(f"{chrom}:FKmer: [green]{fkmer.end}[/green]: AllPass")
logger.debug(f"{chrom}:FKmer: {fkmer.end}\tPass")

# Append valid FKmers
if isinstance(fkmer, FKmer) and fkmer.seqs:
@@ -848,9 +735,9 @@

if logger is not None:
if isinstance(rkmer, tuple):
logger.debug(f"{chrom}:RKmer: [red]{rkmer[0]}\t{rkmer[1].value}[/red]")
logger.debug(f"{chrom}:RKmer: {rkmer[0]}\t{rkmer[1].value}")
else:
logger.debug(f"{chrom}:RKmer: [green]{rkmer.start}[/green]: AllPass")
logger.debug(f"{chrom}:RKmer: {rkmer.start}\tPass")

# Append valid RKmers
if isinstance(rkmer, RKmer) and rkmer.seqs: