Commit

Merge pull request #10 from ChrisgKent/dev
Dev
ChrisgKent authored Oct 31, 2024
2 parents 81bbbe8 + 6b10a36 commit 0e8096a
Showing 17 changed files with 591 additions and 689 deletions.
287 changes: 144 additions & 143 deletions poetry.lock

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion primalscheme3/__init__.py
@@ -1 +1 @@
__version__ = "1.3.1"
__version__ = "1.4.0"
53 changes: 50 additions & 3 deletions primalscheme3/cli.py
@@ -10,9 +10,11 @@
from primalscheme3.__init__ import __version__
from primalscheme3.core.config import Config, MappingType
from primalscheme3.core.msa import parse_msa
from primalscheme3.core.primer_visual import primer_mismatch_heatmap
from primalscheme3.core.primer_visual import bedfile_plot_html, primer_mismatch_heatmap
from primalscheme3.core.progress_tracker import ProgressManager
from primalscheme3.interaction.interaction import visualise_interactions
from primalscheme3.interaction.interaction import (
visualise_interactions,
)
from primalscheme3.panel.panel_main import PanelRunModes, panelcreate
from primalscheme3.remap.remap import remap
from primalscheme3.repair.repair import repair
@@ -398,7 +400,6 @@ def repair_mode(
pm = ProgressManager()

repair(
cores=1,
config_path=config,
bedfile_path=bedfile,
force=force,
@@ -501,5 +502,51 @@ def visualise_primer_mismatches(
)


@app.command(no_args_is_help=True)
def visualise_bedfile(
bedfile: Annotated[
pathlib.Path,
typer.Argument(
help="The bedfile containing the primers",
readable=True,
exists=True,
dir_okay=False,
resolve_path=True,
),
],
ref_id: Annotated[str, typer.Option(help="The reference genome ID")],
ref_path: Annotated[
pathlib.Path,
typer.Argument(
help="The bedfile containing the primers",
readable=True,
exists=True,
dir_okay=False,
resolve_path=True,
),
],
output: Annotated[
pathlib.Path,
typer.Option(help="Output location of the plot", dir_okay=False, writable=True),
] = pathlib.Path("bedfile.html"),
):
"""
Visualise the bedfile
"""
from Bio import SeqIO

ref_file = SeqIO.index(ref_path, "fasta")
ref_genome = ref_file.get(ref_id)
if ref_genome is None:
raise typer.BadParameter(
f"Reference genome ID '{ref_id}' not found in '{ref_path}'"
)

with open(output, "w") as outfile:
outfile.write(
bedfile_plot_html(bedfile=bedfile, ref_name=ref_id, ref_seq=ref_genome)
)


if __name__ == "__main__":
app()
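
For context, a minimal sketch of how the new `visualise_bedfile` subcommand could be exercised programmatically with Typer's test runner. The file paths and record ID are hypothetical, and the command name assumes Typer's default underscore-to-dash conversion.

```python
from typer.testing import CliRunner

from primalscheme3.cli import app

runner = CliRunner()
result = runner.invoke(
    app,
    [
        "visualise-bedfile",
        "primer.bed",        # hypothetical bedfile path (first positional argument)
        "reference.fasta",   # hypothetical reference FASTA (second positional argument)
        "--ref-id", "MN908947.3",  # hypothetical record ID present in the FASTA
        "--output", "bedfile.html",
    ],
)
print(result.exit_code, result.output)
```
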
24 changes: 18 additions & 6 deletions primalscheme3/core/bedfiles.py
@@ -105,7 +105,7 @@ def end(self) -> int:
return self._end

def __str__(self, *kwargs) -> str:
# I use *kwargs so that it can have the same behavor as PrimerPairs
# I use *kwargs so that it can have the same behaviour as PrimerPairs
return f"{self.chrom_name}\t{self.start}\t{self.end}\t{self.primername}\t{self.pool + 1}\t{self.direction}\t{self.sequence}"


@@ -147,7 +147,7 @@ def read_in_bedprimerpairs(path: pathlib.Path) -> tuple[list[BedPrimerPair], lis
primerpairs = []
primerlines, _headers = read_in_bedlines(path) # Ignore headers for now

# Group primers by referance
# Group primers by reference
ref_to_bedlines: dict[str, list[BedLine]] = dict()
for ref in {bedline.chrom_name for bedline in primerlines}:
ref_to_bedlines[ref] = [x for x in primerlines if x.chrom_name == ref]
@@ -162,14 +162,26 @@
x for x in ref_bed_lines if x.amplicon_number == ampliconnumber
]
pool = ampliconlines[0].pool

fp = [x for x in ampliconlines if x.direction == "+"]
rp = [x for x in ampliconlines if x.direction == "-"]

if len(fp) == 0:
raise ValueError(
f"Primer {ampliconlines[0].primername} has no forward primer"
)
if len(rp) == 0:
raise ValueError(
f"Primer {ampliconlines[0].primername} has no reverse primer"
)
# Group the ampliconlines by direction
fkmer = FKmer(
[x.end for x in ampliconlines if x.direction == "+"][0],
[x.sequence for x in ampliconlines if x.direction == "+"],
max([x.end for x in fp]),
[x.sequence for x in fp],
)
rkmer = RKmer(
[x.start for x in ampliconlines if x.direction == "-"][0],
[x.sequence for x in ampliconlines if x.direction == "-"],
min([x.start for x in rp]),
[x.sequence for x in rp],
)
primerpairs.append(
BedPrimerPair(
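The grouping logic added above can be illustrated with a standalone sketch that uses plain tuples in place of the project's BedLine objects; the data and the field order (start, end, direction, sequence) are hypothetical.

```python
# Hypothetical primer lines for one amplicon: (start, end, direction, sequence)
ampliconlines = [
    (100, 120, "+", "ACGTACGTACGTACGTACGT"),
    (102, 122, "+", "GTACGTACGTACGTACGTAC"),  # alternative forward primer
    (380, 400, "-", "TTGCATTGCATTGCATTGCA"),
]

fp = [x for x in ampliconlines if x[2] == "+"]
rp = [x for x in ampliconlines if x[2] == "-"]
if not fp:
    raise ValueError("Amplicon has no forward primer")
if not rp:
    raise ValueError("Amplicon has no reverse primer")

# Anchor the forward kmer at the right-most forward end and the reverse kmer
# at the left-most reverse start, so the chosen coordinate covers every
# sequence in the respective set.
fkmer_end = max(x[1] for x in fp)    # 122
rkmer_start = min(x[0] for x in rp)  # 380
```
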
16 changes: 12 additions & 4 deletions primalscheme3/core/config.py
@@ -5,7 +5,7 @@
from primalscheme3 import __version__


# Writen by Andy Smith, modified by: Chris Kent
# Written by Andy Smith, modified by: Chris Kent
class MappingType(Enum):
"""
Enum for the mapping type
@@ -94,9 +94,17 @@ def items(self) -> dict[str, Any]:
Return a dict (key, val) for non-private, non-callable members
"""
items = {}
for key, val in self.__dict__.items():
if not (callable(getattr(self, key)) or key.startswith("_")):
items[key] = val
for key in [x for x in dir(self) if not x.startswith("_")]:
if not callable(getattr(self, key)): # prevent functions
value = getattr(self, key)
# Convert Enum and Path objects to their values
if isinstance(value, Enum):
items[key] = value.value
if isinstance(value, pathlib.Path):
items[key] = str(value)
else:
items[key] = value

return items

def to_json(self) -> dict[str, Any]:
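The serialisation pattern above — exposing non-private, non-callable attributes while converting Enum members and pathlib.Path values to JSON-friendly primitives — can be sketched against a toy class; all names below are hypothetical, not the project's Config.

```python
import pathlib
from enum import Enum
from typing import Any


class Mode(Enum):
    FIRST = "first"
    CONSENSUS = "consensus"


class ToyConfig:
    mapping = Mode.FIRST
    output = pathlib.Path("./output")
    n_pools = 2

    def items(self) -> dict[str, Any]:
        items: dict[str, Any] = {}
        for key in (k for k in dir(self) if not k.startswith("_")):
            value = getattr(self, key)
            if callable(value):  # skip methods
                continue
            if isinstance(value, Enum):
                items[key] = value.value
            elif isinstance(value, pathlib.Path):
                items[key] = str(value)
            else:
                items[key] = value
        return items


print(ToyConfig().items())
# {'mapping': 'first', 'n_pools': 2, 'output': 'output'}
```
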
129 changes: 8 additions & 121 deletions primalscheme3/core/digestion.py
@@ -1,12 +1,9 @@
# Modules
import itertools
from collections import Counter
from enum import Enum
from typing import Callable, Union

import networkx as nx
import numpy as np
from loguru._logger import Logger

# Submodules
from primaldimer_py import (
@@ -104,7 +101,7 @@ def parse_thermo_error(result: THERMORESULT) -> DIGESTION_ERROR:
case THERMORESULT.HAIRPIN:
return DIGESTION_ERROR.THERMO_HAIRPIN
case _:
raise ValueError("Unknown error occured")
raise ValueError("Unknown error occurred")


def parse_error_list(
@@ -604,14 +601,6 @@ def f_digest_index(
else:
raise ValueError("Unknown error occurred")

# # DownSample the seqs if asked
# if cfg["reducekmers"]:
# wanted_seqs = reduce_kmers(
# seqs={*parsed_seqs.keys()},
# max_edit_dist=cfg["editdist_max"],
# end_3p=cfg["editdist_end3p"],
# )

# Thermo check the kmers
thermo_result = thermo_check_kmers({*parsed_seqs.keys()}, config)
match thermo_result:
@@ -638,108 +627,6 @@ def hamming_dist(s1, s2) -> int:
return sum((x != y for x, y in zip(s1[::-1], s2[::-1])))
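
A quick illustration of the distance above: mismatches are counted from the 3' (right-hand) end, and zip() truncates the comparison to the shorter sequence (hypothetical inputs).

```python
assert hamming_dist("ACGTT", "ACGAT") == 1     # single mismatch near the 3' end
assert hamming_dist("GGGACGTT", "ACGTT") == 0  # extra 5' bases are ignored
```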


def reduce_kmers(seqs: set[str], max_edit_dist: int = 1, end_3p: int = 6) -> set[str]:
"""
Reduces a set of DNA sequences by clustering them based on their 3' end, and then minimizing the edit distance between
all tails within the same 3' cluster. The resulting set of sequences will have at most `max_edit_dist` differences
between any two sequences, and will all have a common 3' end of length `end_3p`.
Args:
seqs: A set of DNA sequences to be reduced.
max_edit_dist: The maximum edit distance allowed between any two sequences in the same 3' cluster. Defaults to 1.
end_3p: The length of the 3' end to use for clustering. Defaults to 6.
Returns:
A set of reduced DNA sequences, where each sequence has a common 3' end of length `end_3p`, and at most
`max_edit_dist` differences between any two sequences.
"""
## Cluster sequences via the 3p end
p3_end_dict: dict[str, set[str]] = {}
for sequence in seqs:
p3_end = sequence[-end_3p:]
p5_tail = sequence[:-end_3p]
if p3_end in p3_end_dict:
p3_end_dict[p3_end].add(p5_tail)
else:
p3_end_dict[p3_end] = {p5_tail}

## Minimise edit distance between all tails within the same p3 cluster
for p3_end, p5_tails in p3_end_dict.items():
# If only one sequence skip
if len(p5_tails) <= 1:
continue

# Create a linkage graph
G = nx.Graph()
G.add_nodes_from(p5_tails)
for s1, s2 in itertools.combinations(p5_tails, 2):
if hamming_dist(s1, s2) <= max_edit_dist:
# Add edges if the distance is <= hamming dist max
G.add_edge(s1, s2)

# Find the most connected sequence
sorted_sequences = sorted(
p5_tails, key=lambda seq: (len(list(G.neighbors(seq))), seq), reverse=True
)

# Seqs which are included in the scheme
included_seqs = set()
# Seqs which have a closely related sequence included
accounted_seqs = set()

for sequence in sorted_sequences:
# If the sequence is not accounted for and not included
if sequence not in accounted_seqs and sequence not in included_seqs:
included_seqs.add(sequence)
# Add all the neighbors into accounted seqs
for neighbors in G.neighbors(sequence):
accounted_seqs.add(neighbors)

# Update the p3_end_dict to contain the downsampled tails
p3_end_dict[p3_end] = included_seqs

seqs = set()
## Regenerate all the sequences from p3_end_dict
for k, v in p3_end_dict.items():
for seq in v:
seqs.add(f"{seq}{k}")
return seqs
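
A toy usage sketch of the reduction described in the docstring above, assuming the function and its networkx/itertools imports are available; the sequences are hypothetical.

```python
# Three primers share the same six-base 3' end; two of the 5' tails differ
# by a single base, so they cluster together and one representative is kept.
seqs = {
    "ACGTACGTAA" + "TTTGCA",
    "ACGTACGTAT" + "TTTGCA",
    "GGGGGGGGGG" + "TTTGCA",
}
reduced = reduce_kmers(seqs, max_edit_dist=1, end_3p=6)
print(len(reduced))  # 2: one representative of the clustered pair, plus the outlier
```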


def concurrent_digest(
msa_array: np.ndarray,
config: Config,
findexes: list[int],
rindexes: list[int],
) -> tuple[list[FKmer], list[RKmer]]:
"""
Carries out the FKmer and RKmer digestion in parallel.
"""
import multiprocessing as mp

q = mp.Queue()
jobs = (f_digest, r_digest)
args = (
(
msa_array.copy(),
config,
findexes,
None,
),
(
msa_array.copy(),
config,
rindexes,
None,
),
)
for job, arg in zip(jobs, args):
p = mp.Process(target=job, args=(arg, q))
p.start()

return q.get(), q.get()


def f_digest(
msa_array: np.ndarray, config: Config, findexes: list[int], logger
) -> list[FKmer]:
@@ -786,7 +673,7 @@ def digest(
config: Config,
progress_manager: ProgressManager,
indexes: tuple[list[int], list[int]] | None = None,
logger: None | Logger = None,
logger=None,
chrom: str = "",
) -> tuple[list[FKmer], list[RKmer]]:
"""
@@ -795,12 +682,12 @@
:param msa_array: The input MSA array.
:param cfg: A dictionary containing configuration parameters.
:param indexes: A tuple of MSA indexes for (FKmers, RKmers), or None to use all indexes.
:param logger: None or the Logguru logger object.
:param logger: None or the logger object.
:return: A tuple containing lists of sorted FKmers and RKmers.
"""
# Guard for invalid indexes
if indexes is not None:
if min(indexes[0]) < 0 or max(indexes[1]) >= msa_array.shape[1]:
if min(indexes[0]) < 0 or max(indexes[0]) >= msa_array.shape[1]:
raise IndexError("FIndexes are out of range")
if min(indexes[1]) < 0 or max(indexes[1]) >= msa_array.shape[1]:
raise IndexError("RIndexes are out of range")
@@ -827,9 +714,9 @@

if logger is not None:
if isinstance(fkmer, tuple):
logger.debug(f"{chrom}:FKmer: [red]{fkmer[0]}\t{fkmer[1].value}[/red]")
logger.debug(f"{chrom}:FKmer: {fkmer[0]}\t{fkmer[1].value}")
else:
logger.debug(f"{chrom}:FKmer: [green]{fkmer.end}[/green]: AllPass")
logger.debug(f"{chrom}:FKmer: {fkmer.end}\tPass")

# Append valid FKmers
if isinstance(fkmer, FKmer) and fkmer.seqs:
@@ -848,9 +735,9 @@

if logger is not None:
if isinstance(rkmer, tuple):
logger.debug(f"{chrom}:RKmer: [red]{rkmer[0]}\t{rkmer[1].value}[/red]")
logger.debug(f"{chrom}:RKmer: {rkmer[0]}\t{rkmer[1].value}")
else:
logger.debug(f"{chrom}:RKmer: [green]{rkmer.start}[/green]: AllPass")
logger.debug(f"{chrom}:RKmer: {rkmer.start}\tPass")

# Append valid RKmers
if isinstance(rkmer, RKmer) and rkmer.seqs: