Skip to content

Commit

Permalink
tweaks
Browse files Browse the repository at this point in the history
BND positions in decomposition are corrected around the anchor base
unroll seqsim is now tried 4 ways (both sequence orders, both signs of the start distance) to check all possibilities
break ties between matches by start/end distance (rare)
  • Loading branch information
ACEnglish committed Jan 30, 2025
1 parent 664fd5b commit 839be09
Show file tree
Hide file tree
Showing 12 changed files with 31 additions and 28 deletions.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fn.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fn.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/fp.vcf.gz.tbi
Binary file not shown.
16 changes: 8 additions & 8 deletions repo_utils/answer_key/bench/bench_bnd_decomp/log.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
2025-01-29 01:41:36,655 [INFO] Truvari v5.0.1.dev0+24859cf.uc
2025-01-29 01:41:36,657 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp2.vcf.gz --sizemax 1000000000 -p 0 --pick multi -o test_results/bench_bnd_decomp/
2025-01-29 01:41:36,658 [INFO] Params:
2025-01-30 21:40:22,568 [INFO] Truvari v5.0.1.dev0+664fd5b
2025-01-30 21:40:22,569 [INFO] Command /data/truvari/__main__.py bench -b repo_utils/test_files/variants/bnd.base.vcf.gz -c repo_utils/test_files/variants/bnd.comp2.vcf.gz --sizemax 1000000000 -p 0 --pick multi -o test_results/bench_bnd_decomp/
2025-01-30 21:40:22,570 [INFO] Params:
{
"base": "/data/repo_utils/test_files/variants/bnd.base.vcf.gz",
"comp": "/data/repo_utils/test_files/variants/bnd.comp2.vcf.gz",
Expand Down Expand Up @@ -36,10 +36,10 @@
"skip_gt": false,
"max_resolve": 25000
}
2025-01-29 01:41:36,746 [WARNING] 193 contigs present in comparison VCF header are not in baseline VCF.
2025-01-29 01:41:58,627 [INFO] Zipped 30098 variants Counter({'comp': 29902, 'base': 196})
2025-01-29 01:41:58,627 [INFO] 20660 chunks of 30098 variants Counter({'comp': 29902, 'base': 195, '__filtered': 1})
2025-01-29 01:42:03,703 [INFO] Stats: {
2025-01-30 21:40:22,652 [WARNING] 193 contigs present in comparison VCF header are not in baseline VCF.
2025-01-30 21:40:43,051 [INFO] Zipped 30098 variants Counter({'comp': 29902, 'base': 196})
2025-01-30 21:40:43,051 [INFO] 20660 chunks of 30098 variants Counter({'comp': 29902, 'base': 195, '__filtered': 1})
2025-01-30 21:40:47,691 [INFO] Stats: {
"TP-base": 152,
"TP-comp": 131,
"FP": 28401,
Expand Down Expand Up @@ -79,4 +79,4 @@
}
}
}
2025-01-29 01:42:03,704 [INFO] Finished bench
2025-01-30 21:40:47,692 [INFO] Finished bench
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/tp-base.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/tp-base.vcf.gz.tbi
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/tp-comp.vcf.gz
Binary file not shown.
Binary file modified repo_utils/answer_key/bench/bench_bnd_decomp/tp-comp.vcf.gz.tbi
Binary file not shown.
5 changes: 4 additions & 1 deletion truvari/comparisons.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,10 @@ def best_seqsim(a_seq, b_seq, st_dist):
else:
rssm = 0
return max(rssm, unroll_seqsim(a_seq, b_seq, st_dist),
unroll_seqsim(b_seq, a_seq, -st_dist), seqsim(a_seq, b_seq))
unroll_seqsim(a_seq, b_seq, -st_dist),
unroll_seqsim(b_seq, a_seq, st_dist),
unroll_seqsim(b_seq, a_seq, -st_dist),
seqsim(a_seq, b_seq))


def roll_seqsim(a_seq, b_seq):
Expand Down
17 changes: 10 additions & 7 deletions truvari/matching.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def __init__(self):
self.gt_match = None
self.multi = None
self.state = False
self.score = None
self.score = 0

def calc_score(self):
"""
Expand All @@ -76,12 +76,15 @@ def calc_score(self):
self.score = (self.seqsim + self.sizesim + self.ovlpct) / 3.0 * 100

def __lt__(self, other):
# Trues are always worth more
if self.state != other.state:
return self.state < other.state
m_score = self.score if self.score is not None else -float('inf')
o_score = other.score if other.score is not None else -float('inf')
return m_score < o_score
def s_abs(value):
"""
Return the negated absolute value of `value`, so that values closer to
zero compare as greater (i.e. a smaller distance ranks higher).
A `value` of None maps to negative infinity, so it ranks lowest.
"""
return -abs(value) if value is not None else -float('inf')
return (
(self.state, self.score, s_abs(self.st_dist), s_abs(self.ed_dist)) <
(other.state, other.score, s_abs(other.st_dist), s_abs(other.ed_dist))
)

def __eq__(self, other):
# Two matches are equal when both their `state` and `score` are equal.
# NOTE(review): the ordering in __lt__ also uses st_dist/ed_dist as
# tie-breakers, so two objects can be == here while one is still < the
# other — confirm this asymmetry is intended.
return self.state == other.state and self.score == other.score
Expand Down
21 changes: 9 additions & 12 deletions truvari/variant_record.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,11 +313,12 @@ def decompose(self):

if svtype == truvari.SV.INV:
record1 = self.copy()
record1.alts = (f"[{self.chrom}:{self.end}[N",)
record1.alts = (f"N]{self.chrom}:{self.end}]",)
record1.info["SVTYPE"] = "BND"

record2 = self.copy()
record2.alts = (f"N]{self.chrom}:{self.end}]",)
record2.pos += 1
record2.alts = (f"[{self.chrom}:{self.end + 1}[N",)
record2.info["SVTYPE"] = "BND"

record3 = self.copy()
Expand All @@ -326,19 +327,19 @@ def decompose(self):
record3.info["SVTYPE"] = "BND"

record4 = self.copy()
record4.pos = self.end
record4.alts = (f"[{self.chrom}:{self.pos}[N",)
record4.pos = self.end + 1
record4.alts = (f"[{self.chrom}:{self.pos + 1}[N",)
record4.info["SVTYPE"] = "BND"

ret = [record1, record2, record3, record4]

elif svtype == truvari.SV.DEL:
record1 = self.copy()
record1.alts = (f"N[{self.chrom}:{self.end}[",)
record1.alts = (f"N[{self.chrom}:{self.end + 1}[",)
record1.info["SVTYPE"] = "BND"

record2 = self.copy()
record2.pos = self.end
record2.pos = self.end + 1
record2.alts = (f"]{self.chrom}:{self.pos}]N",)
record2.info["SVTYPE"] = "BND"

Expand All @@ -347,12 +348,13 @@ def decompose(self):
elif svtype == truvari.SV.DUP:
# Assumes DUP:TANDEM
record1 = self.copy()
record1.pos += 1
record1.alts = (f"]{self.chrom}:{self.end}]N",)
record1.info["SVTYPE"] = "BND"

record2 = self.copy()
record2.pos = self.end
record2.alts = (f"N[{self.chrom}:{self.pos}[",)
record2.alts = (f"N[{self.chrom}:{self.pos + 1}[",)
record2.info["SVTYPE"] = "BND"

ret = [record1, record2]
Expand Down Expand Up @@ -734,11 +736,6 @@ def seqsim(self, other, roll=True):
if not roll or st_dist == 0 or ed_dist == 0:
return truvari.seqsim(a_seq, b_seq)

if st_dist < 0:
st_dist *= -1
else:
a_seq, b_seq = b_seq, a_seq

# Return best of rolled, unrolled from both ends, and direct similarity
# Whichever is highest is how similar these sequences can be
return truvari.best_seqsim(a_seq, b_seq, st_dist)
Expand Down

0 comments on commit 839be09

Please sign in to comment.