Skip to content

Commit

Permalink
code cleaning
Browse files Browse the repository at this point in the history
release prep
  • Loading branch information
ACEnglish committed Jan 5, 2024
1 parent b93842c commit 9da5ef0
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 48 deletions.
6 changes: 3 additions & 3 deletions truvari/collapse.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def collapse_chunk(chunk, matcher):
if matcher.no_consolidate:
for val in ret:
if matcher.gt:
edited_entry, collapse_cnt = super_consolidate(
edited_entry, collapse_cnt = gt_aware_consolidate(
val.entry, val.matches)
else:
edited_entry, collapse_cnt = collapse_into_entry(
Expand Down Expand Up @@ -229,7 +229,7 @@ def fmt_none(value):
return value


def super_consolidate(entry, others):
def gt_aware_consolidate(entry, others):
"""
All formats are consolidated (first one taken)
And two hets consolidated become hom
Expand Down Expand Up @@ -729,12 +729,12 @@ def collapse_main(args):
regions = truvari.RegionVCFIterator(base, includebed=args.bed)
regions.merge_overlaps()
base_i = regions.iterate(base)
outputs = CollapseOutput(args)

chunks = truvari.chunker(matcher, ('base', base_i))
smaller_chunks = tree_size_chunker(matcher, chunks)
even_smaller_chunks = tree_dist_chunker(matcher, smaller_chunks)

outputs = CollapseOutput(args)
m_collap = partial(collapse_chunk, matcher=matcher)
for call in itertools.chain.from_iterable(map(m_collap, even_smaller_chunks)):
outputs.write(call, args.median_info)
Expand Down
82 changes: 42 additions & 40 deletions truvari/msatovcf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,52 @@ def var_to_str(v):
return [var_to_str(del_var), var_to_str(in_var)]


def aln_to_vars(chrom, start_pos, ref_seq, alt_seq, anchor_base):
    """
    Walk a pairwise alignment column by column and yield the variants it implies.

    Consecutive mismatching columns are accumulated into one pending record,
    which is handed to `decompose_variant` as soon as the alignment returns to
    a matching column (or the alignment ends). Whatever `decompose_variant`
    produces is yielded unchanged.

    :param chrom: value placed in the first field of each pending record
    :param start_pos: position assigned to the first reference base of `ref_seq`
    :param ref_seq: aligned reference sequence, using '-' for gaps
    :param alt_seq: aligned alternate sequence, using '-' for gaps
    :param anchor_base: base prepended to REF/ALT when a variant opens before
        any reference base has been consumed

    Columns are paired with `zip`, so iteration stops at the shorter sequence.
    """
    pending = []
    cur_pos = start_pos
    for ref_base, alt_base in zip(ref_seq, alt_seq):
        # Remember whether this column consumes a reference base before the
        # gap character is blanked out.
        on_ref = ref_base != '-'
        if not on_ref:
            ref_base = ""
        if alt_base == '-':
            alt_base = ""

        # gap on gap: the column carries no information for this pair
        if not ref_base and not alt_base:
            continue

        if ref_base != alt_base:
            if pending:
                # Extend the open variant. REFIDX / ALTIDX are module-level
                # indexes of the REF and ALT fields in the record.
                pending[REFIDX] += ref_base
                pending[ALTIDX] += alt_base
            else:
                # -1 for the anchor base we're forcing on
                pending = [chrom, cur_pos - 1, '.', anchor_base + ref_base,
                           anchor_base + alt_base, '.', '.', '.', 'GT']
        elif pending:
            # Back to matching reference: flush the open variant. ref_base is
            # guaranteed non-gap here because gap-on-gap was skipped above.
            yield from decompose_variant(pending)
            pending = []

        if on_ref:
            cur_pos += 1
            # The most recent reference base anchors the next variant.
            anchor_base = ref_base

    # A variant that runs to the end of the alignment is still open.
    if pending:
        yield from decompose_variant(pending)

def msa_to_vars(msa, chrom, ref_seq=None, start_pos=0, abs_anchor_base='N'):
"""
Turn MSA into VCF entries and their presence in samples
returns list of sample names parsed and dictionary of variant : samples containing the variant
"""
sample_names = set()

final_vars = defaultdict(list)

for alt_key in msa.keys():
if alt_key.startswith("ref_"):
continue
Expand All @@ -68,44 +105,9 @@ def msa_to_vars(msa, chrom, ref_seq=None, start_pos=0, abs_anchor_base='N'):
alt_seq = msa[alt_key].upper()

anchor_base = ref_seq[0] if ref_seq[0] != '-' else abs_anchor_base

cur_variant = []
cur_pos = start_pos
# This is too long. need to have a separate zip method
for ref_base, alt_base in zip(ref_seq, alt_seq):
is_ref = ref_base != '-'
if ref_base == '-':
ref_base = ""
if alt_base == '-':
alt_base = ""

# nothing to compare
if not ref_base and not alt_base:
continue

if ref_base == alt_base: # No variant
if cur_variant and is_ref: # back to matching reference
for key in decompose_variant(cur_variant):
final_vars[key].append(cur_samp_hap)
cur_variant = []
else:
if not cur_variant:
# -1 for the anchor base we're forcing on
cur_variant = [chrom, cur_pos - 1, '.', anchor_base + ref_base,
anchor_base + alt_base, '.', '.', '.', 'GT']
else:
cur_variant[REFIDX] += ref_base
cur_variant[ALTIDX] += alt_base
if is_ref:
cur_pos += 1
anchor_base = ref_base
# End Zipping
if cur_variant:
for key in decompose_variant(cur_variant):
final_vars[key].append(cur_samp_hap)
# End alignment
sample_names = sorted(list(sample_names))
return sample_names, final_vars
for variant in aln_to_vars(chrom, start_pos, ref_seq, alt_seq, anchor_base):
final_vars[variant].append(cur_samp_hap)
return sorted(list(sample_names)), final_vars


def make_vcf(variants, sample_names):
Expand Down
6 changes: 2 additions & 4 deletions truvari/phab.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def incorporate(consensus_sequence, entry, correction):
alt_len = len(entry.alts[0]) if entry.alts else 0
if entry.alts[0] == '*':
return correction
# Need to check it doesn't overlap previous position
position = entry.pos + correction
consensus_sequence[position:position + ref_len] = list(entry.alts[0])
return correction + (alt_len - ref_len)
Expand All @@ -120,14 +121,11 @@ def make_consensus(data, ref_fn):
if entry.start < start or entry.stop > end:
continue
if entry.samples[sample]['GT'][0] == 1:
# Checks - doesn't overlap previous position
correction[0] = incorporate(haps[0], entry, correction[0])
if len(entry.samples[sample]['GT']) > 1 and entry.samples[sample]['GT'][1] == 1:
# Checks - doesn't overlap previous position
correction[1] = incorporate(haps[1], entry, correction[1])
# turn into fasta.
ret[ref] = f">{o_samp}_1_{ref}\n{''.join(haps[0])}\n>{o_samp}_2_{ref}\n{''.join(haps[1])}\n".encode(
)
ret[ref] = f">{o_samp}_1_{ref}\n{''.join(haps[0])}\n>{o_samp}_2_{ref}\n{''.join(haps[1])}\n".encode()
return ret


Expand Down
1 change: 0 additions & 1 deletion truvari/stratify.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,6 @@ def benchdir_count_entries(benchdir, regions, within=False, threads=4):
method = partial(count_entries, chroms=chroms,
regions=intvs, within=within)
data = {}
# , maxtasksperchild=1) as pool:
with multiprocessing.Pool(threads) as pool:
for name, counts in zip(names, pool.map(method, vcfs)):
data[name] = counts
Expand Down

0 comments on commit 9da5ef0

Please sign in to comment.