From 94cc65daf89419340da35215b1f689245e155471 Mon Sep 17 00:00:00 2001 From: Sam W Date: Mon, 6 Jan 2025 16:41:23 +0000 Subject: [PATCH] Add support for segmented genomes / amplicon panels (#146) * First test for multiref * Multiref changes * Fix align trim unit test * fix rehead fasta --- artic/align_trim.py | 130 ++++++++++++++++++---------------- artic/fasta_header.py | 25 ++++--- artic/minion.py | 13 ++-- artic/utils.py | 8 +-- tests/align_trim_unit_test.py | 53 ++++++++++---- 5 files changed, 133 insertions(+), 96 deletions(-) diff --git a/artic/align_trim.py b/artic/align_trim.py index 3c31b030..882b68e1 100755 --- a/artic/align_trim.py +++ b/artic/align_trim.py @@ -16,7 +16,7 @@ consumesQuery = [True, True, False, False, True, False, False, True] -def find_primer(bed, pos, direction, threshold=20): +def find_primer(bed, pos, direction, chrom, threshold=35): """Given a reference position and a direction of travel, walk out and find the nearest primer site. Parameters @@ -39,14 +39,14 @@ def find_primer(bed, pos, direction, threshold=20): primer_distances = [ (abs(p["start"] - pos), p["start"] - pos, p) for p in bed - if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) + if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) and chrom == p["chrom"] ] else: primer_distances = [ (abs(p["end"] - pos), p["end"] - pos, p) for p in bed - if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) + if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) and chrom == p["chrom"] ] if not primer_distances: @@ -205,8 +205,10 @@ def handle_segment( return False # locate the nearest primers to this alignment segment - p1 = find_primer(bed, segment.reference_start, "+", args.primer_match_threshold) - p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold) + # p1 = find_primer(bed, segment.reference_start, "+", segment.reference_name, args.primer_match_threshold) + p1 = find_primer(bed=bed, pos=segment.reference_start, direction="+", chrom=segment.reference_name, threshold=args.primer_match_threshold) + # p2 = find_primer(bed, segment.reference_end, "-", segment.reference_name, args.primer_match_threshold) + p2 = find_primer(bed=bed, pos=segment.reference_end, direction="-", chrom=segment.reference_name, threshold=args.primer_match_threshold) if not p1 or not p2: if args.verbose: @@ -235,6 +237,7 @@ def handle_segment( if args.report: # update the report with this alignment segment + primer details report = { + "chrom": segment.reference_name, "QueryName": segment.query_name, "ReferenceStart": segment.reference_start, "ReferenceEnd": segment.reference_end, @@ -342,32 +345,33 @@ def generate_amplicons(bed: list): amplicon = primer["Primer_ID"].split("_")[1] - amplicons.setdefault(amplicon, {}) + amplicons.setdefault(primer["chrom"], {}) + amplicons[primer["chrom"]].setdefault(amplicon, {}) if primer["direction"] == "+": - amplicons[amplicon]["p_start"] = primer["start"] - amplicons[amplicon]["start"] = primer["end"] + 1 + amplicons[primer["chrom"]][amplicon]["p_start"] = primer["start"] + amplicons[primer["chrom"]][amplicon]["start"] = primer["end"] + 1 elif primer["direction"] == "-": - amplicons[amplicon]["p_end"] = primer["end"] - amplicons[amplicon]["end"] = primer["start"] - 1 + amplicons[primer["chrom"]][amplicon]["p_end"] = primer["end"] + amplicons[primer["chrom"]][amplicon]["end"] = primer["start"] - 1 else: raise ValueError("Primer direction not recognised") + for chrom, amplicons_dict in amplicons.items(): + 
for amplicon in amplicons_dict: + if not all([x in amplicons_dict[amplicon] for x in ["p_start", "p_end"]]): + raise ValueError(f"Primer scheme for amplicon {amplicon} for reference {chrom} is incomplete") + + # Check if primer runs accross reference start / end -> circular virus + amplicons_dict[amplicon]["circular"] = ( + amplicons_dict[amplicon]["p_start"] > amplicons_dict[amplicon]["p_end"] + ) - for amplicon in amplicons: - if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]): - raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete") - - # Check if primer runs accross reference start / end -> circular virus - amplicons[amplicon]["circular"] = ( - amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"] - ) - - # Calculate amplicon length considering that the "length" may be negative if the genome is circular - amplicons[amplicon]["length"] = abs( - amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"] - ) + # Calculate amplicon length considering that the "length" may be negative if the genome is circular + amplicons_dict[amplicon]["length"] = abs( + amplicons_dict[amplicon]["p_end"] - amplicons_dict[amplicon]["p_start"] + ) return amplicons @@ -392,51 +396,53 @@ def normalise(trimmed_segments: dict, normalise: int, bed: list, verbose: bool = output_segments = [] - mean_depths = {x: 0 for x in amplicons} + # mean_depths = {x: {} for x in amplicons} + mean_depths = {} - for amplicon, segments in trimmed_segments.items(): - if amplicon not in amplicons: - raise ValueError(f"Segment {amplicon} not found in primer scheme file") - - desired_depth = np.full_like( - (amplicons[amplicon]["length"],), normalise, dtype=int - ) + for chrom, amplicon_dict in trimmed_segments.items(): + for amplicon, segments in amplicon_dict.items(): + if amplicon not in amplicons[chrom]: + raise ValueError(f"Segment {amplicon} not found in primer scheme file") - amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int) + desired_depth = np.full_like( + (amplicons[chrom][amplicon]["length"],), normalise, dtype=int + ) - if not segments: - if verbose: - print( - f"No segments assigned to amplicon {amplicon}, skipping", - file=sys.stderr, - ) - continue + amplicon_depth = np.zeros((amplicons[chrom][amplicon]["length"],), dtype=int) - random.shuffle(segments) + if not segments: + if verbose: + print( + f"No segments assigned to amplicon {amplicon}, skipping", + file=sys.stderr, + ) + continue - distance = np.mean(np.abs(amplicon_depth - desired_depth)) + random.shuffle(segments) - for segment in segments: - test_depths = np.copy(amplicon_depth) + distance = np.mean(np.abs(amplicon_depth - desired_depth)) - relative_start = segment.reference_start - amplicons[amplicon]["p_start"] + for segment in segments: + test_depths = np.copy(amplicon_depth) - if relative_start < 0: - relative_start = 0 + relative_start = segment.reference_start - amplicons[chrom][amplicon]["p_start"] - relative_end = segment.reference_end - amplicons[amplicon]["p_start"] + if relative_start < 0: + relative_start = 0 - test_depths[relative_start:relative_end] += 1 + relative_end = segment.reference_end - amplicons[chrom][amplicon]["p_start"] - test_distance = np.mean(np.abs(test_depths - desired_depth)) + test_depths[relative_start:relative_end] += 1 - if test_distance < distance: - amplicon_depth = test_depths - distance = test_distance - output_segments.append(segment) + test_distance = np.mean(np.abs(test_depths - desired_depth)) - mean_depths[amplicon] = np.mean(amplicon_depth) + if 
test_distance < distance: + amplicon_depth = test_depths + distance = test_distance + output_segments.append(segment) + mean_depths[(chrom, amplicon)] = np.mean(amplicon_depth) + return output_segments, mean_depths @@ -449,6 +455,7 @@ def go(args): if args.report: reportfh = open(args.report, "w") report_headers = [ + "chrom", "QueryName", "ReferenceStart", "ReferenceEnd", @@ -469,6 +476,7 @@ def go(args): # open the primer scheme and get the pools bed = read_bed_file(args.bedfile) pools = set([row["PoolName"] for row in bed]) + chroms = set([row["chrom"] for row in bed]) pools.add("unmatched") # open the input SAM file and process read groups @@ -484,7 +492,7 @@ def go(args): # prepare the alignment outfile outfile = pysam.AlignmentFile("-", "wh", header=bam_header) - trimmed_segments = {} + trimmed_segments = {x: {} for x in chroms} # iterate over the alignment segments in the input SAM file for segment in infile: @@ -508,10 +516,10 @@ def go(args): # unpack the trimming tuple since segment passed trimming amplicon, trimmed_segment = trimming_tuple - trimmed_segments.setdefault(amplicon, []) + trimmed_segments[trimmed_segment.reference_name].setdefault(amplicon, []) if trimmed_segment: - trimmed_segments[amplicon].append(trimmed_segment) + trimmed_segments[trimmed_segment.reference_name][amplicon].append(trimmed_segment) # normalise if requested if args.normalise: @@ -522,9 +530,9 @@ def go(args): # write mean amplicon depths to file if args.amp_depth_report: with open(args.amp_depth_report, "w") as amp_depth_report_fh: - amp_depth_report_fh.write("amplicon\tmean_depth\n") - for amplicon, depth in mean_amp_depths.items(): - amp_depth_report_fh.write(f"{amplicon}\t{depth}\n") + amp_depth_report_fh.write("chrom\tamplicon\tmean_depth\n") + for (chrom, amplicon), depth in mean_amp_depths.items(): + amp_depth_report_fh.write(f"{chrom}\t{amplicon}\t{depth}\n") for output_segment in output_segments: outfile.write(output_segment) @@ -554,7 +562,7 @@ def main(): parser.add_argument( "--primer-match-threshold", type=int, - default=5, + default=35, help="Fuzzy match primer positions within this threshold", ) parser.add_argument("--report", type=str, help="Output report to file") diff --git a/artic/fasta_header.py b/artic/fasta_header.py index b470ae22..03ad2ee5 100644 --- a/artic/fasta_header.py +++ b/artic/fasta_header.py @@ -1,18 +1,16 @@ from Bio import SeqIO -def fasta_header(fn, header): - fh = open(fn) - rec = list(SeqIO.parse(fh, "fasta")) - if len(rec) > 1: - print("Sorry, this script doesn't support multi-FASTA files!") - raise SystemExit - fh.close() +def fasta_header(args): + with open(args.filename) as fh: + rec = list(SeqIO.parse(fh, "fasta")) - fh = open(fn, "w") - rec[0].id = header - SeqIO.write([rec[0]], fh, "fasta") - fh.close() + with open(args.filename, "w") as fh: + for record in rec: + chrom = record.id + record.id = f"{args.samplename}/{chrom}/ARTIC/{args.caller}" + + SeqIO.write(record, fh, "fasta") def main(): @@ -22,10 +20,11 @@ def main(): description="Trim alignments from an amplicon scheme." 
) parser.add_argument("filename") - parser.add_argument("header") + parser.add_argument("samplename") + parser.add_argument("caller") args = parser.parse_args() - fasta_header(args.filename, args.header) + fasta_header(args) if __name__ == "__main__": diff --git a/artic/minion.py b/artic/minion.py index 42e42ee1..33b19728 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -135,20 +135,22 @@ def run(parser, args): normalise_string = "" cmds.append( - f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.bam" + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam" ) cmds.append( - f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam" + f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam" ) + cmds.append(f"rm {args.sample}.trimmed.rg.sam") cmds.append( - f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.bam" + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam" ) cmds.append( - f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam" + f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam" ) + cmds.append(f"rm {args.sample}.primertrimmed.rg.sam") cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam") cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam") @@ -223,9 +225,8 @@ def run(parser, args): # 11) apply the header to the consensus sequence and run alignment against the reference sequence caller = "clair3" - fasta_header = f"{args.sample}/ARTIC/{caller}" cmds.append( - 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) + f"artic_fasta_header {args.sample}.consensus.fasta {args.sample} {caller}" ) if args.align_consensus: diff --git a/artic/utils.py b/artic/utils.py index b96e95fe..1a7b46e9 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -362,12 +362,12 @@ def read_bed_file(fn): for _, row in primers.iterrows(): scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_") - if (primer_id, direction) not in canonical_primers: - canonical_primers[(primer_id, direction)] = row.to_dict() + if (row["chrom"], primer_id, direction) not in canonical_primers: + canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict() continue - canonical_primers[(primer_id, direction)] = merge_sites( - canonical_primers[(primer_id, direction)], row + canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites( + canonical_primers[(row["chrom"], primer_id, direction)], row ) # return the bedFile as a list diff --git 
a/tests/align_trim_unit_test.py b/tests/align_trim_unit_test.py index d8bebab9..3e8fca6c 100644 --- a/tests/align_trim_unit_test.py +++ b/tests/align_trim_unit_test.py @@ -10,10 +10,34 @@ TEST_DIR = os.path.dirname(os.path.abspath(__file__)) # dummy primers (using min required fields) -p1 = {"start": 0, "end": 10, "direction": "+", "primerID": "primer1_LEFT"} -p2 = {"start": 30, "end": 40, "direction": "-", "primerID": "primer1_RIGHT"} -p3 = {"start": 10, "end": 20, "direction": "+", "primerID": "primer2_LEFT"} -p4 = {"start": 40, "end": 50, "direction": "-", "primerID": "primer2_RIGHT"} +p1 = { + "chrom": "MN908947.3", + "start": 0, + "end": 10, + "direction": "+", + "primerID": "primer1_LEFT", +} +p2 = { + "chrom": "MN908947.3", + "start": 30, + "end": 40, + "direction": "-", + "primerID": "primer1_RIGHT", +} +p3 = { + "chrom": "MN908947.3", + "start": 10, + "end": 20, + "direction": "+", + "primerID": "primer2_LEFT", +} +p4 = { + "chrom": "MN908947.3", + "start": 40, + "end": 50, + "direction": "-", + "primerID": "primer2_RIGHT", +} # primer scheme to hold dummy primers dummyPrimerScheme = [p1, p2, p3, p4] @@ -23,8 +47,10 @@ TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed" ) +header = pysam.AlignmentHeader.from_dict({"SQ": [{"LN": 29903, "SN": "MN908947.3"}]}) + # nCov alignment segment (derived from a real nCov read) -seg1 = pysam.AlignedSegment() +seg1 = pysam.AlignedSegment(header=header) seg1.query_name = "0be29940-97ae-440e-b02c-07748edeceec" seg1.flag = 0 seg1.reference_id = 0 @@ -35,7 +61,7 @@ seg1.query_qualities = [30] * 490 # nCov alignment segment (derived from a real nCov read) - will result in a leading CIGAR deletion during softmasking -seg2 = pysam.AlignedSegment() +seg2 = pysam.AlignedSegment(header=header) seg2.query_name = "15c86d34-a527-4506-9b0c-f62827d01555" seg2.flag = 0 seg2.reference_id = 0 @@ -60,11 +86,11 @@ def test_find_primer(): for primer in dummyPrimerScheme: if primer["direction"] == "+": result = align_trim.find_primer( - dummyPrimerScheme, primer["start"], primer["direction"] + dummyPrimerScheme, primer["start"], primer["direction"], "MN908947.3" ) else: result = align_trim.find_primer( - dummyPrimerScheme, primer["end"], primer["direction"] + dummyPrimerScheme, primer["end"], primer["direction"], "MN908947.3" ) assert result @@ -73,13 +99,13 @@ def test_find_primer(): ), "find_primer did not produce the query primer, which should be nearest" # test against other ref positions - result = align_trim.find_primer(dummyPrimerScheme, 8, "+") + result = align_trim.find_primer(dummyPrimerScheme, 8, "+", "MN908947.3") assert result assert ( result[2]["primerID"] == "primer2_LEFT" ), "find_primer returned incorrect primer" - result = align_trim.find_primer(dummyPrimerScheme, 25, "-") + result = align_trim.find_primer(dummyPrimerScheme, 25, "-", "MN908947.3") assert result assert ( result[2]["primerID"] == "primer1_RIGHT" @@ -92,10 +118,13 @@ def test_trim(): def testRunner(seg, expectedCIGAR): # get the nearest primers to the alignment segment - p1 = align_trim.find_primer(primerScheme, seg.reference_start, "+") - p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-") + p1 = align_trim.find_primer( + primerScheme, seg.reference_start, "+", "MN908947.3" + ) + p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-", "MN908947.3") # get the primer positions + print(p1, p2) p1_position = p1[2]["end"] p2_position = p2[2]["start"]
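
Illustration (not part of the diff; reference names, coordinates and primer IDs are invented): a minimal sketch of how the reworked find_primer behaves once primers are matched per reference. Only scheme rows whose chrom equals the queried reference are considered, and a falsy value comes back when nothing lies within the match threshold, which handle_segment treats as an unmatched read.

    from artic import align_trim

    # Hypothetical two-segment scheme, using the field names produced by read_bed_file.
    scheme = [
        {"chrom": "segment_1", "start": 0,   "end": 22,  "direction": "+", "Primer_ID": "scheme_1_LEFT"},
        {"chrom": "segment_1", "start": 380, "end": 402, "direction": "-", "Primer_ID": "scheme_1_RIGHT"},
        {"chrom": "segment_2", "start": 5,   "end": 27,  "direction": "+", "Primer_ID": "scheme_1_LEFT"},
        {"chrom": "segment_2", "start": 410, "end": 431, "direction": "-", "Primer_ID": "scheme_1_RIGHT"},
    ]

    # A read mapped to segment_2 is now matched only against segment_2 primers.
    hit = align_trim.find_primer(bed=scheme, pos=10, direction="+", chrom="segment_2")
    if hit:
        abs_dist, signed_dist, primer = hit  # (|distance|, signed distance, primer row)
        print(primer["chrom"], primer["Primer_ID"], signed_dist)

    # No primers exist for segment_3, so the lookup is falsy (read left unmatched).
    assert not align_trim.find_primer(bed=scheme, pos=10, direction="+", chrom="segment_3")

The same per-reference grouping carries through the rest of the patch: trimmed segments and mean amplicon depths are grouped by reference and amplicon, the align report and the amplicon depth report gain a chrom column, and artic_fasta_header now reheads every record of a multi-FASTA consensus as samplename/chrom/ARTIC/caller.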