forked from capoony/ABBABABA-4AF
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
59 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,99 +1,105 @@ | ||
import sys | ||
from collections import defaultdict as d | ||
from optparse import OptionParser, OptionGroup | ||
|
||
#Author: Martin Kapun | ||
# Author: Martin Kapun | ||
|
||
######################################################### HELP ######################################################################### | ||
usage="python %prog --sync input.sync > output.af " | ||
usage = "python %prog --sync input.sync > output.af " | ||
parser = OptionParser(usage=usage) | ||
group=OptionGroup(parser,''' | ||
group = OptionGroup(parser, ''' | ||
''') | ||
|
||
######################################################### CODE ######################################################################### | ||
|
||
parser.add_option("--sync", dest="IN", help="Input file") | ||
parser.add_option("--MinCov", dest="mc", help="minor coverage threshold",default=10) | ||
parser.add_option("--MinCov", dest="mc", | ||
help="minor coverage threshold", default=10) | ||
|
||
(options, args) = parser.parse_args() | ||
parser.add_option_group(group) | ||
|
||
def sync2freqh(x,ALL,mc): | ||
|
||
def sync2freqh(x, ALL, mc): | ||
''' convert string in SYNC format to dictionary of freqencies where x is a string in sync format''' | ||
from collections import defaultdict as d | ||
if x==".:.:.:.:.:." or x=="0:0:0:0:0:0": | ||
return "na","na" | ||
nuc=["A","T","C","G"] | ||
counts=[int(X) for X in x.split(":")[:4]] | ||
if sum(counts)==0: | ||
return ({"A":0.0,"T":0.0,"C":0.0,"G":0.0},0) | ||
CO={k:v for k,v in list(zip(*[nuc,counts]))if k in ALL} | ||
|
||
h=d(float) | ||
if sum(CO.values())<mc: | ||
return "na","na" | ||
for k,v in CO.items(): | ||
h[k]=v/float(sum(CO.values())) | ||
|
||
return h,sum(CO.values()) | ||
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0": | ||
return "na", "na" | ||
nuc = ["A", "T", "C", "G"] | ||
counts = [int(X) for X in x.split(":")[:4]] | ||
if sum(counts) == 0: | ||
return ({"A": 0.0, "T": 0.0, "C": 0.0, "G": 0.0}, 0) | ||
CO = {k: v for k, v in list(zip(*[nuc, counts]))if k in ALL} | ||
|
||
h = d(float) | ||
if sum(CO.values()) < mc: | ||
return "na", "na" | ||
for k, v in CO.items(): | ||
h[k] = v / float(sum(CO.values())) | ||
|
||
return h, sum(CO.values()) | ||
|
||
|
||
def all_alleles(v): | ||
''' returns most common strings''' | ||
from collections import Counter | ||
nuc=v.replace("N","") | ||
if len(nuc)==0 or len(set(nuc))<2: | ||
nuc = v.replace("N", "") | ||
if len(nuc) == 0 or len(set(nuc)) < 2: | ||
return "NA" | ||
return list(zip(*Counter(nuc).most_common()))[0] | ||
|
||
|
||
def sync2string(x): | ||
''' convert sync format to string of nucleotides where x is a string in sync format ''' | ||
string="" | ||
alleles=["A","T","C","G"] | ||
if x==".:.:.:.:.:." or x=="0:0:0:0:0:0": | ||
string = "" | ||
alleles = ["A", "T", "C", "G"] | ||
if x == ".:.:.:.:.:." or x == "0:0:0:0:0:0": | ||
return "na" | ||
ah=dict(zip(alleles,[int(X) for X in x.split(":")[:4]])) | ||
for k,v in ah.items(): | ||
string+=v*k | ||
ah = dict(zip(alleles, [int(X) for X in x.split(":")[:4]])) | ||
for k, v in ah.items(): | ||
string += v * k | ||
return string | ||
|
||
|
||
def load_data(x): | ||
''' import data either from a gzipped or or uncrompessed file or from STDIN''' | ||
import gzip | ||
if x=="-": | ||
y=sys.stdin | ||
elif x.endswith(".gz"): | ||
y=gzip.open(x,"rt", encoding="latin-1") | ||
else: | ||
y=open(x,"r", encoding="latin-1") | ||
return y | ||
''' import data either from a gzipped or or uncrompessed file or from STDIN''' | ||
import gzip | ||
if x == "-": | ||
y = sys.stdin | ||
elif x.endswith(".gz"): | ||
y = gzip.open(x, "rt", encoding="latin-1") | ||
else: | ||
y = open(x, "r", encoding="latin-1") | ||
return y | ||
|
||
|
||
for l in load_data(options.IN): | ||
|
||
# split in columns | ||
a=l.rstrip().split() | ||
ID=a[0]+":"+a[1] | ||
a = l.rstrip().split() | ||
ID = a[0] + ":" + a[1] | ||
|
||
## make long string of all summed-up alleles across all populations | ||
AlleleStrings="" | ||
# make long string of all summed-up alleles across all populations | ||
AlleleStrings = "" | ||
for pop in a[3:]: | ||
Counts=sync2string(pop) | ||
if Counts=="na": | ||
Counts = sync2string(pop) | ||
if Counts == "na": | ||
continue | ||
AlleleStrings+=Counts | ||
AlleleStrings += Counts | ||
|
||
## identify two most common alleles | ||
MA=all_alleles(AlleleStrings)[:2] | ||
# identify two most common alleles | ||
MA = all_alleles(AlleleStrings)[:2] | ||
|
||
## skip if position invariate | ||
if MA=="NA": | ||
# skip if position invariate | ||
if MA == "NA": | ||
continue | ||
fl=[] | ||
fl = [] | ||
|
||
## calculate and print frequencies of major alleles | ||
# calculate and print frequencies of major alleles | ||
for pop in a[3:]: | ||
FH=sync2freqh(pop,MA,int(options.mc)) | ||
FH = sync2freqh(pop, MA, int(options.mc)) | ||
if "na" in FH: | ||
fl.append("NA") | ||
continue | ||
fl.append(FH[0][MA[0]]) | ||
print("\t".join(a[:2])+"\t"+"/".join(MA[:2])+"\t"+"\t".join([str(x) for x in fl])) | ||
print("\t".join(a[:2]) + "\t" + "/".join(MA[:2]) | ||
+ "\t" + "\t".join([str(x) for x in fl])) |