Skip to content

Commit

Permalink
Correct vcf2psuedogenome and reference_to_single_sequence for multiple
Browse files Browse the repository at this point in the history
  • Loading branch information
antunderwood committed May 8, 2021
1 parent df7fa7c commit b728ca5
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import textwrap
import argparse, sys, os

class ParserWithErrors(argparse.ArgumentParser):
Expand Down Expand Up @@ -33,7 +34,9 @@ def argparser():
def combine_sequences(reference_sequence):
records = list(SeqIO.parse(reference_sequence, "fasta"))
new_sequence = ''.join([str(record.seq) for record in records])
new_id = '-'.join([record.id for record in records])
new_id = '|'.join([record.id for record in records])
if len(new_id) > 100:
new_id = textwrap.shorten(new_id, width=97, placeholder="...")
new_record = SeqRecord(Seq(new_sequence), id = new_id, description = '')
return(new_record)

Expand Down
13 changes: 7 additions & 6 deletions bin/vcf2pseudogenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,6 @@ def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths):
previous_positions[chrom] = 0
with VariantFile(filtered_bcf_file) as vcf_reader:
for record in vcf_reader.fetch():
if record.pos % 10000 == 0:
print(record.pos)
record_chrom = record.chrom
if record.pos == previous_positions[record_chrom]: # Insertion - remove previous character and add 'N'
sequences[record_chrom].pop() # remove last position
Expand All @@ -77,8 +75,11 @@ def filtered_bcf_to_fasta(filtered_bcf_file, reference_lengths):
else: # if not PASS it's a low qual SNP so add N
sequences[record_chrom].append('N')
previous_positions[record_chrom] = record.pos
if previous_positions[record_chrom] != reference_lengths[record_chrom]: # if gap at the end
sequences[record_chrom].extend(calculate_gaps_to_add(previous_positions[record_chrom], reference_lengths[record_chrom]))

# check for gaps at end
for chrom in sequences:
if len(sequences[chrom]) != reference_lengths[record_chrom]: # if gap at the end
sequences[chrom].extend(calculate_gaps_to_add(len(sequences[chrom]), reference_lengths[chrom]))
return sequences

def write_sequence(filepath, id, sequence):
Expand All @@ -90,8 +91,8 @@ def write_sequence(filepath, id, sequence):
parser = argparser()
args = parser.parse_args()

reference_length = calculate_reference_lengths(args.reference_file)
pseudogenome_sequence_lists = filtered_bcf_to_fasta(args.filtered_bcf_file, reference_length)
reference_lengths = calculate_reference_lengths(args.reference_file)
pseudogenome_sequence_lists = filtered_bcf_to_fasta(args.filtered_bcf_file, reference_lengths)
pseudogenome_sequences = [ ''.join(sequence_list) for sequence_list in pseudogenome_sequence_lists.values()]
combined_pseudogenome_sequence = ''.join(sequence for sequence in pseudogenome_sequences)

Expand Down
2 changes: 1 addition & 1 deletion modules/local/alignpseudogenomes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ process ALIGNPSEUDOGENOMES {
echo "\$pseudogenome\t\$fraction_non_GATC_bases" >> low_quality_pseudogenomes.tsv
fi
done
reference_to_single_sequence.py -r ${reference} -o final_reference.fas
reference2single_sequence.py -r ${reference} -o final_reference.fas
cat final_reference.fas >> aligned_pseudogenomes.fas
NUM_ALIGNMENT_GENOMES=\$(grep -c ">" aligned_pseudogenomes.fas)
Expand Down

0 comments on commit b728ca5

Please sign in to comment.