Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
capoony committed Sep 28, 2021
1 parent 5afbcdd commit 4d05678
Showing 1 changed file with 59 additions and 53 deletions.
112 changes: 59 additions & 53 deletions SYNC2AF.py
Original file line number Diff line number Diff line change
@@ -1,99 +1,105 @@
import sys
from collections import defaultdict as d
from optparse import OptionParser, OptionGroup

#Author: Martin Kapun
# Author: Martin Kapun

######################################################### HELP #########################################################################
usage="python %prog --sync input.sync > output.af "
usage = "python %prog --sync input.sync > output.af "
parser = OptionParser(usage=usage)
group=OptionGroup(parser,'''
group = OptionGroup(parser, '''
''')

######################################################### CODE #########################################################################

parser.add_option("--sync", dest="IN", help="Input file")
parser.add_option("--MinCov", dest="mc", help="minor coverage threshold",default=10)
parser.add_option("--MinCov", dest="mc",
help="minor coverage threshold", default=10)

(options, args) = parser.parse_args()
parser.add_option_group(group)

def sync2freqh(x,ALL,mc):

def sync2freqh(x, ALL, mc):
''' convert string in SYNC format to dictionary of freqencies where x is a string in sync format'''
from collections import defaultdict as d
if x==".:.:.:.:.:." or x=="0:0:0:0:0:0":
return "na","na"
nuc=["A","T","C","G"]
counts=[int(X) for X in x.split(":")[:4]]
if sum(counts)==0:
return ({"A":0.0,"T":0.0,"C":0.0,"G":0.0},0)
CO={k:v for k,v in list(zip(*[nuc,counts]))if k in ALL}

h=d(float)
if sum(CO.values())<mc:
return "na","na"
for k,v in CO.items():
h[k]=v/float(sum(CO.values()))

return h,sum(CO.values())
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0":
return "na", "na"
nuc = ["A", "T", "C", "G"]
counts = [int(X) for X in x.split(":")[:4]]
if sum(counts) == 0:
return ({"A": 0.0, "T": 0.0, "C": 0.0, "G": 0.0}, 0)
CO = {k: v for k, v in list(zip(*[nuc, counts]))if k in ALL}

h = d(float)
if sum(CO.values()) < mc:
return "na", "na"
for k, v in CO.items():
h[k] = v / float(sum(CO.values()))

return h, sum(CO.values())


def all_alleles(v):
''' returns most common strings'''
from collections import Counter
nuc=v.replace("N","")
if len(nuc)==0 or len(set(nuc))<2:
nuc = v.replace("N", "")
if len(nuc) == 0 or len(set(nuc)) < 2:
return "NA"
return list(zip(*Counter(nuc).most_common()))[0]


def sync2string(x):
''' convert sync format to string of nucleotides where x is a string in sync format '''
string=""
alleles=["A","T","C","G"]
if x==".:.:.:.:.:." or x=="0:0:0:0:0:0":
string = ""
alleles = ["A", "T", "C", "G"]
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0":
return "na"
ah=dict(zip(alleles,[int(X) for X in x.split(":")[:4]]))
for k,v in ah.items():
string+=v*k
ah = dict(zip(alleles, [int(X) for X in x.split(":")[:4]]))
for k, v in ah.items():
string += v * k
return string


def load_data(x):
''' import data either from a gzipped or or uncrompessed file or from STDIN'''
import gzip
if x=="-":
y=sys.stdin
elif x.endswith(".gz"):
y=gzip.open(x,"rt", encoding="latin-1")
else:
y=open(x,"r", encoding="latin-1")
return y
''' import data either from a gzipped or or uncrompessed file or from STDIN'''
import gzip
if x == "-":
y = sys.stdin
elif x.endswith(".gz"):
y = gzip.open(x, "rt", encoding="latin-1")
else:
y = open(x, "r", encoding="latin-1")
return y


for l in load_data(options.IN):

# split in columns
a=l.rstrip().split()
ID=a[0]+":"+a[1]
a = l.rstrip().split()
ID = a[0] + ":" + a[1]

## make long string of all summed-up alleles across all populations
AlleleStrings=""
# make long string of all summed-up alleles across all populations
AlleleStrings = ""
for pop in a[3:]:
Counts=sync2string(pop)
if Counts=="na":
Counts = sync2string(pop)
if Counts == "na":
continue
AlleleStrings+=Counts
AlleleStrings += Counts

## identify two most common alleles
MA=all_alleles(AlleleStrings)[:2]
# identify two most common alleles
MA = all_alleles(AlleleStrings)[:2]

## skip if position invariate
if MA=="NA":
# skip if position invariate
if MA == "NA":
continue
fl=[]
fl = []

## calculate and print frequencies of major alleles
# calculate and print frequencies of major alleles
for pop in a[3:]:
FH=sync2freqh(pop,MA,int(options.mc))
FH = sync2freqh(pop, MA, int(options.mc))
if "na" in FH:
fl.append("NA")
continue
fl.append(FH[0][MA[0]])
print("\t".join(a[:2])+"\t"+"/".join(MA[:2])+"\t"+"\t".join([str(x) for x in fl]))
print("\t".join(a[:2]) + "\t" + "/".join(MA[:2])
+ "\t" + "\t".join([str(x) for x in fl]))

0 comments on commit 4d05678

Please sign in to comment.