From 94cc65daf89419340da35215b1f689245e155471 Mon Sep 17 00:00:00 2001 From: Sam W Date: Mon, 6 Jan 2025 16:41:23 +0000 Subject: [PATCH] Add support for segmented genomes / amplicon panels (#146) * First test for multiref * Multiref changes * Fix align trim unit test * fix rehead fasta --- artic/align_trim.py | 130 ++++++++++++++++++---------------- artic/fasta_header.py | 25 ++++--- artic/minion.py | 13 ++-- artic/utils.py | 8 +-- tests/align_trim_unit_test.py | 53 ++++++++++---- 5 files changed, 133 insertions(+), 96 deletions(-) diff --git a/artic/align_trim.py b/artic/align_trim.py index 3c31b030..882b68e1 100755 --- a/artic/align_trim.py +++ b/artic/align_trim.py @@ -16,7 +16,7 @@ consumesQuery = [True, True, False, False, True, False, False, True] -def find_primer(bed, pos, direction, threshold=20): +def find_primer(bed, pos, direction, chrom, threshold=35): """Given a reference position and a direction of travel, walk out and find the nearest primer site. Parameters @@ -39,14 +39,14 @@ def find_primer(bed, pos, direction, threshold=20): primer_distances = [ (abs(p["start"] - pos), p["start"] - pos, p) for p in bed - if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) + if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) and chrom == p["chrom"] ] else: primer_distances = [ (abs(p["end"] - pos), p["end"] - pos, p) for p in bed - if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) + if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) and chrom == p["chrom"] ] if not primer_distances: @@ -205,8 +205,10 @@ def handle_segment( return False # locate the nearest primers to this alignment segment - p1 = find_primer(bed, segment.reference_start, "+", args.primer_match_threshold) - p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold) + # p1 = find_primer(bed, segment.reference_start, "+", segment.reference_name, args.primer_match_threshold) + p1 = find_primer(bed=bed, pos=segment.reference_start, direction="+", chrom=segment.reference_name, threshold=args.primer_match_threshold) + # p2 = find_primer(bed, segment.reference_end, "-", segment.reference_name, args.primer_match_threshold) + p2 = find_primer(bed=bed, pos=segment.reference_end, direction="-", chrom=segment.reference_name, threshold=args.primer_match_threshold) if not p1 or not p2: if args.verbose: @@ -235,6 +237,7 @@ def handle_segment( if args.report: # update the report with this alignment segment + primer details report = { + "chrom": segment.reference_name, "QueryName": segment.query_name, "ReferenceStart": segment.reference_start, "ReferenceEnd": segment.reference_end, @@ -342,32 +345,33 @@ def generate_amplicons(bed: list): amplicon = primer["Primer_ID"].split("_")[1] - amplicons.setdefault(amplicon, {}) + amplicons.setdefault(primer["chrom"], {}) + amplicons[primer["chrom"]].setdefault(amplicon, {}) if primer["direction"] == "+": - amplicons[amplicon]["p_start"] = primer["start"] - amplicons[amplicon]["start"] = primer["end"] + 1 + amplicons[primer["chrom"]][amplicon]["p_start"] = primer["start"] + amplicons[primer["chrom"]][amplicon]["start"] = primer["end"] + 1 elif primer["direction"] == "-": - amplicons[amplicon]["p_end"] = primer["end"] - amplicons[amplicon]["end"] = primer["start"] - 1 + amplicons[primer["chrom"]][amplicon]["p_end"] = primer["end"] + amplicons[primer["chrom"]][amplicon]["end"] = primer["start"] - 1 else: raise ValueError("Primer direction not recognised") + for chrom, amplicons_dict in amplicons.items(): + 
for amplicon in amplicons_dict: + if not all([x in amplicons_dict[amplicon] for x in ["p_start", "p_end"]]): + raise ValueError(f"Primer scheme for amplicon {amplicon} for reference {chrom} is incomplete") + + # Check if primer runs accross reference start / end -> circular virus + amplicons_dict[amplicon]["circular"] = ( + amplicons_dict[amplicon]["p_start"] > amplicons_dict[amplicon]["p_end"] + ) - for amplicon in amplicons: - if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]): - raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete") - - # Check if primer runs accross reference start / end -> circular virus - amplicons[amplicon]["circular"] = ( - amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"] - ) - - # Calculate amplicon length considering that the "length" may be negative if the genome is circular - amplicons[amplicon]["length"] = abs( - amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"] - ) + # Calculate amplicon length considering that the "length" may be negative if the genome is circular + amplicons_dict[amplicon]["length"] = abs( + amplicons_dict[amplicon]["p_end"] - amplicons_dict[amplicon]["p_start"] + ) return amplicons @@ -392,51 +396,53 @@ def normalise(trimmed_segments: dict, normalise: int, bed: list, verbose: bool = output_segments = [] - mean_depths = {x: 0 for x in amplicons} + # mean_depths = {x: {} for x in amplicons} + mean_depths = {} - for amplicon, segments in trimmed_segments.items(): - if amplicon not in amplicons: - raise ValueError(f"Segment {amplicon} not found in primer scheme file") - - desired_depth = np.full_like( - (amplicons[amplicon]["length"],), normalise, dtype=int - ) + for chrom, amplicon_dict in trimmed_segments.items(): + for amplicon, segments in amplicon_dict.items(): + if amplicon not in amplicons[chrom]: + raise ValueError(f"Segment {amplicon} not found in primer scheme file") - amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int) + desired_depth = np.full_like( + (amplicons[chrom][amplicon]["length"],), normalise, dtype=int + ) - if not segments: - if verbose: - print( - f"No segments assigned to amplicon {amplicon}, skipping", - file=sys.stderr, - ) - continue + amplicon_depth = np.zeros((amplicons[chrom][amplicon]["length"],), dtype=int) - random.shuffle(segments) + if not segments: + if verbose: + print( + f"No segments assigned to amplicon {amplicon}, skipping", + file=sys.stderr, + ) + continue - distance = np.mean(np.abs(amplicon_depth - desired_depth)) + random.shuffle(segments) - for segment in segments: - test_depths = np.copy(amplicon_depth) + distance = np.mean(np.abs(amplicon_depth - desired_depth)) - relative_start = segment.reference_start - amplicons[amplicon]["p_start"] + for segment in segments: + test_depths = np.copy(amplicon_depth) - if relative_start < 0: - relative_start = 0 + relative_start = segment.reference_start - amplicons[chrom][amplicon]["p_start"] - relative_end = segment.reference_end - amplicons[amplicon]["p_start"] + if relative_start < 0: + relative_start = 0 - test_depths[relative_start:relative_end] += 1 + relative_end = segment.reference_end - amplicons[chrom][amplicon]["p_start"] - test_distance = np.mean(np.abs(test_depths - desired_depth)) + test_depths[relative_start:relative_end] += 1 - if test_distance < distance: - amplicon_depth = test_depths - distance = test_distance - output_segments.append(segment) + test_distance = np.mean(np.abs(test_depths - desired_depth)) - mean_depths[amplicon] = np.mean(amplicon_depth) + if 
test_distance < distance: + amplicon_depth = test_depths + distance = test_distance + output_segments.append(segment) + mean_depths[(chrom, amplicon)] = np.mean(amplicon_depth) + return output_segments, mean_depths @@ -449,6 +455,7 @@ def go(args): if args.report: reportfh = open(args.report, "w") report_headers = [ + "chrom", "QueryName", "ReferenceStart", "ReferenceEnd", @@ -469,6 +476,7 @@ def go(args): # open the primer scheme and get the pools bed = read_bed_file(args.bedfile) pools = set([row["PoolName"] for row in bed]) + chroms = set([row["chrom"] for row in bed]) pools.add("unmatched") # open the input SAM file and process read groups @@ -484,7 +492,7 @@ def go(args): # prepare the alignment outfile outfile = pysam.AlignmentFile("-", "wh", header=bam_header) - trimmed_segments = {} + trimmed_segments = {x: {} for x in chroms} # iterate over the alignment segments in the input SAM file for segment in infile: @@ -508,10 +516,10 @@ def go(args): # unpack the trimming tuple since segment passed trimming amplicon, trimmed_segment = trimming_tuple - trimmed_segments.setdefault(amplicon, []) + trimmed_segments[trimmed_segment.reference_name].setdefault(amplicon, []) if trimmed_segment: - trimmed_segments[amplicon].append(trimmed_segment) + trimmed_segments[trimmed_segment.reference_name][amplicon].append(trimmed_segment) # normalise if requested if args.normalise: @@ -522,9 +530,9 @@ def go(args): # write mean amplicon depths to file if args.amp_depth_report: with open(args.amp_depth_report, "w") as amp_depth_report_fh: - amp_depth_report_fh.write("amplicon\tmean_depth\n") - for amplicon, depth in mean_amp_depths.items(): - amp_depth_report_fh.write(f"{amplicon}\t{depth}\n") + amp_depth_report_fh.write("chrom\tamplicon\tmean_depth\n") + for (chrom, amplicon), depth in mean_amp_depths.items(): + amp_depth_report_fh.write(f"{chrom}\t{amplicon}\t{depth}\n") for output_segment in output_segments: outfile.write(output_segment) @@ -554,7 +562,7 @@ def main(): parser.add_argument( "--primer-match-threshold", type=int, - default=5, + default=35, help="Fuzzy match primer positions within this threshold", ) parser.add_argument("--report", type=str, help="Output report to file") diff --git a/artic/fasta_header.py b/artic/fasta_header.py index b470ae22..03ad2ee5 100644 --- a/artic/fasta_header.py +++ b/artic/fasta_header.py @@ -1,18 +1,16 @@ from Bio import SeqIO -def fasta_header(fn, header): - fh = open(fn) - rec = list(SeqIO.parse(fh, "fasta")) - if len(rec) > 1: - print("Sorry, this script doesn't support multi-FASTA files!") - raise SystemExit - fh.close() +def fasta_header(args): + with open(args.filename) as fh: + rec = list(SeqIO.parse(fh, "fasta")) - fh = open(fn, "w") - rec[0].id = header - SeqIO.write([rec[0]], fh, "fasta") - fh.close() + with open(args.filename, "w") as fh: + for record in rec: + chrom = record.id + record.id = f"{args.samplename}/{chrom}/ARTIC/{args.caller}" + + SeqIO.write(record, fh, "fasta") def main(): @@ -22,10 +20,11 @@ def main(): description="Trim alignments from an amplicon scheme." 
) parser.add_argument("filename") - parser.add_argument("header") + parser.add_argument("samplename") + parser.add_argument("caller") args = parser.parse_args() - fasta_header(args.filename, args.header) + fasta_header(args) if __name__ == "__main__": diff --git a/artic/minion.py b/artic/minion.py index 42e42ee1..33b19728 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -135,20 +135,22 @@ def run(parser, args): normalise_string = "" cmds.append( - f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.bam" + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam" ) cmds.append( - f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam" + f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam" ) + cmds.append(f"rm {args.sample}.trimmed.rg.sam") cmds.append( - f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.bam" + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam" ) cmds.append( - f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam" + f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam" ) + cmds.append(f"rm {args.sample}.primertrimmed.rg.sam") cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam") cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam") @@ -223,9 +225,8 @@ def run(parser, args): # 11) apply the header to the consensus sequence and run alignment against the reference sequence caller = "clair3" - fasta_header = f"{args.sample}/ARTIC/{caller}" cmds.append( - 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) + f"artic_fasta_header {args.sample}.consensus.fasta {args.sample} {caller}" ) if args.align_consensus: diff --git a/artic/utils.py b/artic/utils.py index b96e95fe..1a7b46e9 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -362,12 +362,12 @@ def read_bed_file(fn): for _, row in primers.iterrows(): scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_") - if (primer_id, direction) not in canonical_primers: - canonical_primers[(primer_id, direction)] = row.to_dict() + if (row["chrom"], primer_id, direction) not in canonical_primers: + canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict() continue - canonical_primers[(primer_id, direction)] = merge_sites( - canonical_primers[(primer_id, direction)], row + canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites( + canonical_primers[(row["chrom"], primer_id, direction)], row ) # return the bedFile as a list diff --git 
a/tests/align_trim_unit_test.py b/tests/align_trim_unit_test.py index d8bebab9..3e8fca6c 100644 --- a/tests/align_trim_unit_test.py +++ b/tests/align_trim_unit_test.py @@ -10,10 +10,34 @@ TEST_DIR = os.path.dirname(os.path.abspath(__file__)) # dummy primers (using min required fields) -p1 = {"start": 0, "end": 10, "direction": "+", "primerID": "primer1_LEFT"} -p2 = {"start": 30, "end": 40, "direction": "-", "primerID": "primer1_RIGHT"} -p3 = {"start": 10, "end": 20, "direction": "+", "primerID": "primer2_LEFT"} -p4 = {"start": 40, "end": 50, "direction": "-", "primerID": "primer2_RIGHT"} +p1 = { + "chrom": "MN908947.3", + "start": 0, + "end": 10, + "direction": "+", + "primerID": "primer1_LEFT", +} +p2 = { + "chrom": "MN908947.3", + "start": 30, + "end": 40, + "direction": "-", + "primerID": "primer1_RIGHT", +} +p3 = { + "chrom": "MN908947.3", + "start": 10, + "end": 20, + "direction": "+", + "primerID": "primer2_LEFT", +} +p4 = { + "chrom": "MN908947.3", + "start": 40, + "end": 50, + "direction": "-", + "primerID": "primer2_RIGHT", +} # primer scheme to hold dummy primers dummyPrimerScheme = [p1, p2, p3, p4] @@ -23,8 +47,10 @@ TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed" ) +header = pysam.AlignmentHeader.from_dict({"SQ": [{"LN": 29903, "SN": "MN908947.3"}]}) + # nCov alignment segment (derived from a real nCov read) -seg1 = pysam.AlignedSegment() +seg1 = pysam.AlignedSegment(header=header) seg1.query_name = "0be29940-97ae-440e-b02c-07748edeceec" seg1.flag = 0 seg1.reference_id = 0 @@ -35,7 +61,7 @@ seg1.query_qualities = [30] * 490 # nCov alignment segment (derived from a real nCov read) - will result in a leading CIGAR deletion during softmasking -seg2 = pysam.AlignedSegment() +seg2 = pysam.AlignedSegment(header=header) seg2.query_name = "15c86d34-a527-4506-9b0c-f62827d01555" seg2.flag = 0 seg2.reference_id = 0 @@ -60,11 +86,11 @@ def test_find_primer(): for primer in dummyPrimerScheme: if primer["direction"] == "+": result = align_trim.find_primer( - dummyPrimerScheme, primer["start"], primer["direction"] + dummyPrimerScheme, primer["start"], primer["direction"], "MN908947.3" ) else: result = align_trim.find_primer( - dummyPrimerScheme, primer["end"], primer["direction"] + dummyPrimerScheme, primer["end"], primer["direction"], "MN908947.3" ) assert result @@ -73,13 +99,13 @@ def test_find_primer(): ), "find_primer did not produce the query primer, which should be nearest" # test against other ref positions - result = align_trim.find_primer(dummyPrimerScheme, 8, "+") + result = align_trim.find_primer(dummyPrimerScheme, 8, "+", "MN908947.3") assert result assert ( result[2]["primerID"] == "primer2_LEFT" ), "find_primer returned incorrect primer" - result = align_trim.find_primer(dummyPrimerScheme, 25, "-") + result = align_trim.find_primer(dummyPrimerScheme, 25, "-", "MN908947.3") assert result assert ( result[2]["primerID"] == "primer1_RIGHT" @@ -92,10 +118,13 @@ def test_trim(): def testRunner(seg, expectedCIGAR): # get the nearest primers to the alignment segment - p1 = align_trim.find_primer(primerScheme, seg.reference_start, "+") - p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-") + p1 = align_trim.find_primer( + primerScheme, seg.reference_start, "+", "MN908947.3" + ) + p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-", "MN908947.3") # get the primer positions + print(p1, p2) p1_position = p1[2]["end"] p2_position = p2[2]["start"]
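
Illustration (not part of the diff; reference names, coordinates and primer IDs are invented): a minimal sketch of how the reworked find_primer behaves once primers are matched per reference. Only scheme rows whose chrom equals the queried reference are considered, and a falsy value comes back when nothing lies within the match threshold, which handle_segment treats as an unmatched read.

    from artic import align_trim

    # Hypothetical two-segment scheme, using the field names produced by read_bed_file.
    scheme = [
        {"chrom": "segment_1", "start": 0,   "end": 22,  "direction": "+", "Primer_ID": "scheme_1_LEFT"},
        {"chrom": "segment_1", "start": 380, "end": 402, "direction": "-", "Primer_ID": "scheme_1_RIGHT"},
        {"chrom": "segment_2", "start": 5,   "end": 27,  "direction": "+", "Primer_ID": "scheme_1_LEFT"},
        {"chrom": "segment_2", "start": 410, "end": 431, "direction": "-", "Primer_ID": "scheme_1_RIGHT"},
    ]

    # A read mapped to segment_2 is now matched only against segment_2 primers.
    hit = align_trim.find_primer(bed=scheme, pos=10, direction="+", chrom="segment_2")
    if hit:
        abs_dist, signed_dist, primer = hit  # (|distance|, signed distance, primer row)
        print(primer["chrom"], primer["Primer_ID"], signed_dist)

    # No primers exist for segment_3, so the lookup is falsy (read left unmatched).
    assert not align_trim.find_primer(bed=scheme, pos=10, direction="+", chrom="segment_3")

The same per-reference grouping carries through the rest of the patch: trimmed segments and mean amplicon depths are grouped by reference and amplicon, the align report and the amplicon depth report gain a chrom column, and artic_fasta_header now reheads every record of a multi-FASTA consensus as samplename/chrom/ARTIC/caller.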