Skip to content

Commit 4d68571

Browse files
committed
First test for multiref
1 parent 6cc5f5d commit 4d68571

File tree

3 files changed

+79
-69
lines changed

3 files changed

+79
-69
lines changed

artic/align_trim.py

Lines changed: 69 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
consumesQuery = [True, True, False, False, True, False, False, True]
1717

1818

19-
def find_primer(bed, pos, direction, threshold=20):
19+
def find_primer(bed, pos, direction, chrom, threshold=35):
2020
"""Given a reference position and a direction of travel, walk out and find the nearest primer site.
2121
2222
Parameters
@@ -39,14 +39,14 @@ def find_primer(bed, pos, direction, threshold=20):
3939
primer_distances = [
4040
(abs(p["start"] - pos), p["start"] - pos, p)
4141
for p in bed
42-
if (p["direction"] == direction) and (pos >= (p["start"] - threshold))
42+
if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) and chrom == p["chrom"]
4343
]
4444

4545
else:
4646
primer_distances = [
4747
(abs(p["end"] - pos), p["end"] - pos, p)
4848
for p in bed
49-
if (p["direction"] == direction) and (pos <= (p["end"] + threshold))
49+
if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) and chrom == p["chrom"]
5050
]
5151

5252
if not primer_distances:
@@ -205,8 +205,10 @@ def handle_segment(
205205
return False
206206

207207
# locate the nearest primers to this alignment segment
208-
p1 = find_primer(bed, segment.reference_start, "+", args.primer_match_threshold)
209-
p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold)
208+
# p1 = find_primer(bed, segment.reference_start, "+", segment.reference_name, args.primer_match_threshold)
209+
p1 = find_primer(bed=bed, pos=segment.reference_start, direction="+", chrom=segment.reference_name, threshold=args.primer_match_threshold)
210+
# p2 = find_primer(bed, segment.reference_end, "-", segment.reference_name, args.primer_match_threshold)
211+
p2 = find_primer(bed=bed, pos=segment.reference_end, direction="-", chrom=segment.reference_name, threshold=args.primer_match_threshold)
210212

211213
if not p1 or not p2:
212214
if args.verbose:
@@ -235,6 +237,7 @@ def handle_segment(
235237
if args.report:
236238
# update the report with this alignment segment + primer details
237239
report = {
240+
"chrom": segment.reference_name,
238241
"QueryName": segment.query_name,
239242
"ReferenceStart": segment.reference_start,
240243
"ReferenceEnd": segment.reference_end,
@@ -342,32 +345,33 @@ def generate_amplicons(bed: list):
342345

343346
amplicon = primer["Primer_ID"].split("_")[1]
344347

345-
amplicons.setdefault(amplicon, {})
348+
amplicons.setdefault(primer["chrom"], {})
349+
amplicons[primer["chrom"]].setdefault(amplicon, {})
346350

347351
if primer["direction"] == "+":
348-
amplicons[amplicon]["p_start"] = primer["start"]
349-
amplicons[amplicon]["start"] = primer["end"] + 1
352+
amplicons[primer["chrom"]][amplicon]["p_start"] = primer["start"]
353+
amplicons[primer["chrom"]][amplicon]["start"] = primer["end"] + 1
350354

351355
elif primer["direction"] == "-":
352-
amplicons[amplicon]["p_end"] = primer["end"]
353-
amplicons[amplicon]["end"] = primer["start"] - 1
356+
amplicons[primer["chrom"]][amplicon]["p_end"] = primer["end"]
357+
amplicons[primer["chrom"]][amplicon]["end"] = primer["start"] - 1
354358

355359
else:
356360
raise ValueError("Primer direction not recognised")
361+
for chrom, amplicons_dict in amplicons.items():
362+
for amplicon in amplicons_dict:
363+
if not all([x in amplicons_dict[amplicon] for x in ["p_start", "p_end"]]):
364+
raise ValueError(f"Primer scheme for amplicon {amplicon} for reference {chrom} is incomplete")
365+
366+
# Check if primer runs across reference start / end -> circular virus
367+
amplicons_dict[amplicon]["circular"] = (
368+
amplicons_dict[amplicon]["p_start"] > amplicons_dict[amplicon]["p_end"]
369+
)
357370

358-
for amplicon in amplicons:
359-
if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]):
360-
raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete")
361-
362-
# Check if primer runs accross reference start / end -> circular virus
363-
amplicons[amplicon]["circular"] = (
364-
amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"]
365-
)
366-
367-
# Calculate amplicon length considering that the "length" may be negative if the genome is circular
368-
amplicons[amplicon]["length"] = abs(
369-
amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"]
370-
)
371+
# Calculate amplicon length considering that the "length" may be negative if the genome is circular
372+
amplicons_dict[amplicon]["length"] = abs(
373+
amplicons_dict[amplicon]["p_end"] - amplicons_dict[amplicon]["p_start"]
374+
)
371375

372376
return amplicons
373377

@@ -392,51 +396,53 @@ def normalise(trimmed_segments: dict, normalise: int, bed: list, verbose: bool =
392396

393397
output_segments = []
394398

395-
mean_depths = {x: 0 for x in amplicons}
399+
# mean_depths = {x: {} for x in amplicons}
400+
mean_depths = {}
396401

397-
for amplicon, segments in trimmed_segments.items():
398-
if amplicon not in amplicons:
399-
raise ValueError(f"Segment {amplicon} not found in primer scheme file")
400-
401-
desired_depth = np.full_like(
402-
(amplicons[amplicon]["length"],), normalise, dtype=int
403-
)
402+
for chrom, amplicon_dict in trimmed_segments.items():
403+
for amplicon, segments in amplicon_dict.items():
404+
if amplicon not in amplicons[chrom]:
405+
raise ValueError(f"Segment {amplicon} not found in primer scheme file")
404406

405-
amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int)
407+
desired_depth = np.full_like(
408+
(amplicons[chrom][amplicon]["length"],), normalise, dtype=int
409+
)
406410

407-
if not segments:
408-
if verbose:
409-
print(
410-
f"No segments assigned to amplicon {amplicon}, skipping",
411-
file=sys.stderr,
412-
)
413-
continue
411+
amplicon_depth = np.zeros((amplicons[chrom][amplicon]["length"],), dtype=int)
414412

415-
random.shuffle(segments)
413+
if not segments:
414+
if verbose:
415+
print(
416+
f"No segments assigned to amplicon {amplicon}, skipping",
417+
file=sys.stderr,
418+
)
419+
continue
416420

417-
distance = np.mean(np.abs(amplicon_depth - desired_depth))
421+
random.shuffle(segments)
418422

419-
for segment in segments:
420-
test_depths = np.copy(amplicon_depth)
423+
distance = np.mean(np.abs(amplicon_depth - desired_depth))
421424

422-
relative_start = segment.reference_start - amplicons[amplicon]["p_start"]
425+
for segment in segments:
426+
test_depths = np.copy(amplicon_depth)
423427

424-
if relative_start < 0:
425-
relative_start = 0
428+
relative_start = segment.reference_start - amplicons[chrom][amplicon]["p_start"]
426429

427-
relative_end = segment.reference_end - amplicons[amplicon]["p_start"]
430+
if relative_start < 0:
431+
relative_start = 0
428432

429-
test_depths[relative_start:relative_end] += 1
433+
relative_end = segment.reference_end - amplicons[chrom][amplicon]["p_start"]
430434

431-
test_distance = np.mean(np.abs(test_depths - desired_depth))
435+
test_depths[relative_start:relative_end] += 1
432436

433-
if test_distance < distance:
434-
amplicon_depth = test_depths
435-
distance = test_distance
436-
output_segments.append(segment)
437+
test_distance = np.mean(np.abs(test_depths - desired_depth))
437438

438-
mean_depths[amplicon] = np.mean(amplicon_depth)
439+
if test_distance < distance:
440+
amplicon_depth = test_depths
441+
distance = test_distance
442+
output_segments.append(segment)
439443

444+
mean_depths[(chrom, amplicon)] = np.mean(amplicon_depth)
445+
440446
return output_segments, mean_depths
441447

442448

@@ -449,6 +455,7 @@ def go(args):
449455
if args.report:
450456
reportfh = open(args.report, "w")
451457
report_headers = [
458+
"chrom",
452459
"QueryName",
453460
"ReferenceStart",
454461
"ReferenceEnd",
@@ -469,6 +476,7 @@ def go(args):
469476
# open the primer scheme and get the pools
470477
bed = read_bed_file(args.bedfile)
471478
pools = set([row["PoolName"] for row in bed])
479+
chroms = set([row["chrom"] for row in bed])
472480
pools.add("unmatched")
473481

474482
# open the input SAM file and process read groups
@@ -484,7 +492,7 @@ def go(args):
484492
# prepare the alignment outfile
485493
outfile = pysam.AlignmentFile("-", "wh", header=bam_header)
486494

487-
trimmed_segments = {}
495+
trimmed_segments = {x: {} for x in chroms}
488496

489497
# iterate over the alignment segments in the input SAM file
490498
for segment in infile:
@@ -508,10 +516,10 @@ def go(args):
508516

509517
# unpack the trimming tuple since segment passed trimming
510518
amplicon, trimmed_segment = trimming_tuple
511-
trimmed_segments.setdefault(amplicon, [])
519+
trimmed_segments[trimmed_segment.reference_name].setdefault(amplicon, [])
512520

513521
if trimmed_segment:
514-
trimmed_segments[amplicon].append(trimmed_segment)
522+
trimmed_segments[trimmed_segment.reference_name][amplicon].append(trimmed_segment)
515523

516524
# normalise if requested
517525
if args.normalise:
@@ -522,9 +530,9 @@ def go(args):
522530
# write mean amplicon depths to file
523531
if args.amp_depth_report:
524532
with open(args.amp_depth_report, "w") as amp_depth_report_fh:
525-
amp_depth_report_fh.write("amplicon\tmean_depth\n")
526-
for amplicon, depth in mean_amp_depths.items():
527-
amp_depth_report_fh.write(f"{amplicon}\t{depth}\n")
533+
amp_depth_report_fh.write("chrom\tamplicon\tmean_depth\n")
534+
for (chrom, amplicon), depth in mean_amp_depths.items():
535+
amp_depth_report_fh.write(f"{chrom}\t{amplicon}\t{depth}\n")
528536

529537
for output_segment in output_segments:
530538
outfile.write(output_segment)
@@ -554,7 +562,7 @@ def main():
554562
parser.add_argument(
555563
"--primer-match-threshold",
556564
type=int,
557-
default=5,
565+
default=35,
558566
help="Fuzzy match primer positions within this threshold",
559567
)
560568
parser.add_argument("--report", type=str, help="Output report to file")

artic/minion.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,20 +134,22 @@ def run(parser, args):
134134
normalise_string = ""
135135

136136
cmds.append(
137-
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.bam"
137+
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam"
138138
)
139139

140140
cmds.append(
141-
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam"
141+
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam"
142142
)
143+
cmds.append(f"rm {args.sample}.trimmed.rg.sam")
143144

144145
cmds.append(
145-
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.bam"
146+
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam"
146147
)
147148

148149
cmds.append(
149-
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam"
150+
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam"
150151
)
152+
cmds.append(f"rm {args.sample}.primertrimmed.rg.sam")
151153

152154
cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam")
153155
cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam")

artic/utils.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -359,12 +359,12 @@ def read_bed_file(fn):
359359
for _, row in primers.iterrows():
360360
scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_")
361361

362-
if (primer_id, direction) not in canonical_primers:
363-
canonical_primers[(primer_id, direction)] = row.to_dict()
362+
if (row["chrom"], primer_id, direction) not in canonical_primers:
363+
canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict()
364364
continue
365365

366-
canonical_primers[(primer_id, direction)] = merge_sites(
367-
canonical_primers[(primer_id, direction)], row
366+
canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites(
367+
canonical_primers[(row["chrom"], primer_id, direction)], row
368368
)
369369

370370
# return the bedFile as a list

0 commit comments

Comments
 (0)