Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
capoony committed Sep 28, 2021
1 parent 0034cf9 commit 5afbcdd
Showing 1 changed file with 148 additions and 132 deletions.
280 changes: 148 additions & 132 deletions ABBABABA-4AF.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from collections import defaultdict as d
from optparse import OptionParser, OptionGroup

#Author: Martin Kapun
# Author: Martin Kapun

######################################################### HELP #########################################################################
usage="python3 %prog --AF file.af --SNPs 500 --Order 4,5,6,7 --Output file "
usage = "python3 %prog --AF file.af --SNPs 500 --Order 4,5,6,7 --Output file "
parser = OptionParser(usage=usage)
group=OptionGroup(parser,'''
group = OptionGroup(parser, '''
/\
/ \
/\ \
Expand All @@ -22,236 +22,252 @@

######################################################### CODE #########################################################################

parser.add_option("--AF", dest="AF", help="Input file; First three columns: Chrom, Pos, MajorAllele/MinorAllele")
parser.add_option("--AF", dest="AF",
help="Input file; First three columns: Chrom, Pos, MajorAllele/MinorAllele")
parser.add_option("--Output", dest="OUT", help="Output prefix")
parser.add_option("--Order", dest="ORDER", help="column positions of populations H1,H2,H3,H4 in input file")
parser.add_option("--SNPs", dest="SNP", help="number of SNPS",default="NA")
parser.add_option("--window", dest="window", help="window size in bp",default="NA")
parser.add_option("--Order", dest="ORDER",
help="column positions of populations H1,H2,H3,H4 in input file")
parser.add_option("--SNPs", dest="SNP", help="number of SNPS", default="NA")
parser.add_option("--window", dest="window",
help="window size in bp", default="NA")

(options, args) = parser.parse_args()
parser.add_option_group(group)

def D1(x,XL,YL,fXL,fYL,N):

def D1(x, XL, YL, fXL, fYL, N):
'''calculate nominator X and demoninator Y a la Soraggi et al. 2018, assumes X=BABA-ABBA '''
A,B,C,D=x
X=(A-B)*(C-D)
Y=(A+B-2*A*B)*(C+D-2*C*D)
A, B, C, D = x
X = (A - B) * (C - D)
Y = (A + B - 2 * A * B) * (C + D - 2 * C * D)
XL.append(X)
fXL[N].append(X)
YL.append(Y)
fYL[N].append(Y)

def D2(x,XL,YL,fXL,fYL,N):

def D2(x, XL, YL, fXL, fYL, N):
'''calculate nominator X and demoninator Y a la Durand et al 2011, assumes X=ABBA-BABA'''
A,B,C,D=x
X=(1-A)*B*C*(1-D)-A*(1-B)*C*(1-D)
Y=(1-A)*B*C*(1-D)+A*(1-B)*C*(1-D)
A, B, C, D = x
X = (1 - A) * B * C * (1 - D) - A * (1 - B) * C * (1 - D)
Y = (1 - A) * B * C * (1 - D) + A * (1 - B) * C * (1 - D)
XL.append(X)
fXL[N].append(X)
YL.append(Y)
fYL[N].append(Y)


def blockJackEven(fx,fy,D):
def blockJackEven(fx, fy, D):
''' see Busing et al. 1999 '''

## at first retain only windows that don't result in window-wise D="NA"
fxnew=[]
fynew=[]
# at first retain only windows that don't result in window-wise D="NA"
fxnew = []
fynew = []
for i in range(len(D)):
if D[i]== "NA":
if D[i] == "NA":
continue
fxnew.append(fx[i])
fynew.append(fy[i])

## then calculate overall expected D
Dest=sum([sum(x) for x in fxnew])/float(sum([sum(y) for y in fynew]))
# then calculate overall expected D
Dest = sum([sum(x) for x in fxnew]) / float(sum([sum(y) for y in fynew]))

# calculate D when removing one group at a time:
Dmean=[]
Dmean = []
for i in range(len(fxnew)):
fxprime=fxnew[:i] + fxnew[i+1 :]
fyprime=fynew[:i] + fynew[i+1 :]
Dmean.append(sum([sum(x) for x in fxprime])/float(sum([sum(y) for y in fyprime])))
fxprime = fxnew[:i] + fxnew[i + 1:]
fyprime = fynew[:i] + fynew[i + 1:]
Dmean.append(sum([sum(x) for x in fxprime])
/ float(sum([sum(y) for y in fyprime])))

# g is the number of independent groups
g = float(len(fxnew))

## g is the number of independent groups
g=float(len(fxnew))
# caluclate the mean of all group-wise estimates of D
meanJack = sum(Dmean) / float(len(Dmean))

## caluclate the mean of all group-wise estimates of D
meanJack=sum(Dmean)/float(len(Dmean))
# calculate the jackknife estimate and variance of D and z-score
DjackEst = g * Dest - (g - 1) * meanJack
DjackVar = ((g - 1) / g) * sum([(Di - Dest)**2 for Di in Dmean])
Z = Dest / (DjackVar**0.5)

## calculate the jackknife estimate and variance of D and z-score
DjackEst=g*Dest-(g-1)*meanJack
DjackVar=((g-1)/g)*sum([(Di-Dest)**2 for Di in Dmean])
Z=Dest/(DjackVar**0.5)
# return
return Dest, DjackEst, DjackVar, Z

## return
return Dest,DjackEst,DjackVar,Z

def test0(x,y,z):
def test0(x, y, z):
''' calculate the ratio of x and y if bin contains enough SNPs and if y is not 0, then append to list z '''

if sum(y)==0:
xy="NA"
elif options.SNP!="NA" and len(x)!=SNPs:
xy="NA"
if sum(y) == 0:
xy = "NA"
elif options.SNP != "NA" and len(x) != SNPs:
xy = "NA"
else:
xy=str(sum(x)/float(sum(y)))
if float(xy)>1 or float(xy)<-1:
xy="NA"
xy = str(sum(x) / float(sum(y)))
if float(xy) > 1 or float(xy) < -1:
xy = "NA"
z.append(xy)
return xy


def load_data(x):
''' import data either from a gzipped or or uncrompessed file or from STDIN'''
''' import data either from a gzipped or or uncrompessed file or from STDIN'''

import gzip
if x == "-":
y = sys.stdin
elif x.endswith(".gz"):
y = gzip.open(x, "rt", encoding="latin-1")
else:
y = open(x, "r", encoding="latin-1")
return y

import gzip
if x=="-":
y=sys.stdin
elif x.endswith(".gz"):
y=gzip.open(x,"rt", encoding="latin-1")
else:
y=open(x,"r", encoding="latin-1")
return y

if options.SNP != "NA":
SNPs=int(options.SNP)
SNPs = int(options.SNP)
else:
WS=int(options.window)
WS = int(options.window)

ORDER=[int(x) for x in options.ORDER.split(",")]
ORDER = [int(x) for x in options.ORDER.split(",")]

Chr=""
Chr = ""

# Store the position of every SNP
SNPwindows=[]
SNPwindows = []

## to calculate a global value
fSNPX1,fSNPX2=d(list),d(list)
fSNPY1,fSNPY2=d(list),d(list)
# to calculate a global value
fSNPX1, fSNPX2 = d(list), d(list)
fSNPY1, fSNPY2 = d(list), d(list)

## to calculate window-wise values
SNPX1,SNPX2=[],[]
SNPY1,SNPY2=[],[]
# to calculate window-wise values
SNPX1, SNPX2 = [], []
SNPY1, SNPY2 = [], []

## make list to store window-wise values of D
D1L,D2L=[],[]
# make list to store window-wise values of D
D1L, D2L = [], []

## open Outputfile for window-wise estimates
outWindow=open(options.OUT,"w")
# open Outputfile for window-wise estimates
outWindow = open(options.OUT, "w")

## Counter of SNP groups
if options.SNP!="NA":
N=0
# Counter of SNP groups
if options.SNP != "NA":
N = 0
else:
N=""
N = ""

outWindow.write("Chromosome\tStartPos\tAveragePos\tEndPos\tWindowSize\tNoOfSNPs\tSoraggi\tDurand\n")
outWindow.write(
"Chromosome\tStartPos\tAveragePos\tEndPos\tWindowSize\tNoOfSNPs\tSoraggi\tDurand\n")

for l in load_data(options.AF):
a=l.rstrip().split()

if options.window!="NA" and N=="":
N=round(float(a[1])/WS,0)
# skip header line
if l.startswith("#"):
continue

a = l.rstrip().split()

## only keep allele frequencies of populations used for ABBA BABA and retain order
AFs=[a[x] for x in ORDER]
if Chr=="":
Chr=a[0]
if options.window != "NA" and N == "":
N = round(float(a[1]) / WS, 0)

## continue if one AF missing
# only keep allele frequencies of populations used for ABBA BABA and retain order
AFs = [a[x] for x in ORDER]
if Chr == "":
Chr = a[0]

# continue if one AF missing
if "NA" in AFs or "na" in AFs:
continue

AFs=[float(x) for x in AFs]
AFs = [float(x) for x in AFs]

## continue if site is not informative according to Sorragi et al., i.e. if H1==H2 or H3==H4
if AFs[0]==AFs[1] or AFs[2]==AFs[3]:
# continue if site is not informative according to Sorragi et al., i.e. if H1==H2 or H3==H4
if AFs[0] == AFs[1] or AFs[2] == AFs[3]:
continue

# condition AFs for the ancestral allele which is more common in the outgroup population
if AFs[-1]<0.5:
AFs=[1-x for x in AFs]

if AFs[-1] < 0.5:
AFs = [1 - x for x in AFs]

# test if condition for new window:

if options.SNP!="NA":
if len(SNPX1)==SNPs:
cond=True
if options.SNP != "NA":
if len(SNPX1) == SNPs:
cond = True
else:
cond=False
cond = False
else:
if round(float(a[1])/WS,0)!=N:
cond=True
if round(float(a[1]) / WS, 0) != N:
cond = True
else:
cond=False
cond = False

# if the end of a window is reached or a new contig starts, calculate the window-specific D, print it to the outputfile and move on
if Chr!=a[0] or cond:
if Chr != a[0] or cond:

#print(Chr,a[0],len(SNPX1),SNPwindows)
# print(Chr,a[0],len(SNPX1),SNPwindows)

# calculate D
D1s=test0(SNPX1,SNPY1,D1L)
D2s=test0(SNPX2,SNPY2,D2L)
D1s = test0(SNPX1, SNPY1, D1L)
D2s = test0(SNPX2, SNPY2, D2L)

if len(SNPX1)!=0:
START=min(SNPwindows)
END=max(SNPwindows)
MIDDLE=sum(SNPwindows)/float(len(SNPwindows))
LENGTH=END-START
if len(SNPX1) != 0:
START = min(SNPwindows)
END = max(SNPwindows)
MIDDLE = sum(SNPwindows) / float(len(SNPwindows))
LENGTH = END - START

# only print the values of D if the number of SNPs is equivalent to the number of SNPs defined as the binsize
outWindow.write(Chr+"\t"+"\t".join([str(x) for x in [START,MIDDLE,END,LENGTH]])+"\t"+str(len(SNPX1))+"\t"+D1s+"\t"+D2s+"\n")
outWindow.write(Chr + "\t" + "\t".join([str(x) for x in [
START, MIDDLE, END, LENGTH]]) + "\t" + str(len(SNPX1)) + "\t" + D1s + "\t" + D2s + "\n")

# reset the lists
SNPwindows=[]
SNPX1,SNPX2=[],[]
SNPY1,SNPY2=[],[]
Chr=a[0]
SNPwindows = []
SNPX1, SNPX2 = [], []
SNPY1, SNPY2 = [], []
Chr = a[0]

SNPwindows.append(int(a[1]))

# store the numerator and denominator of D
D1([float(x) for x in AFs],SNPX1,SNPY1,fSNPX1,fSNPY1,N)
D2([float(x) for x in AFs],SNPX2,SNPY2,fSNPX2,fSNPY2,N)
D1([float(x) for x in AFs], SNPX1, SNPY1, fSNPX1, fSNPY1, N)
D2([float(x) for x in AFs], SNPX2, SNPY2, fSNPX2, fSNPY2, N)

## increase counter by one to start a new group
if options.SNP!="NA":
N+=1
# increase counter by one to start a new group
if options.SNP != "NA":
N += 1
else:
N=round(float(a[1])/float(options.window),0)
N = round(float(a[1]) / float(options.window), 0)

continue

SNPwindows.append(int(a[1]))
Chr=a[0]
Chr = a[0]

D1([float(x) for x in AFs],SNPX1,SNPY1,fSNPX1,fSNPY1,N)
D2([float(x) for x in AFs],SNPX2,SNPY2,fSNPX2,fSNPY2,N)
D1([float(x) for x in AFs], SNPX1, SNPY1, fSNPX1, fSNPY1, N)
D2([float(x) for x in AFs], SNPX2, SNPY2, fSNPX2, fSNPY2, N)

D1s=test0(SNPX1,SNPY1,D1L)
D2s=test0(SNPX2,SNPY2,D2L)
D1s = test0(SNPX1, SNPY1, D1L)
D2s = test0(SNPX2, SNPY2, D2L)

if len(SNPX1)!=0:
if len(SNPX1) != 0:

START=min(SNPwindows)
END=max(SNPwindows)
MIDDLE=sum(SNPwindows)/float(len(SNPwindows))
LENGTH=END-START
START = min(SNPwindows)
END = max(SNPwindows)
MIDDLE = sum(SNPwindows) / float(len(SNPwindows))
LENGTH = END - START

outWindow.write(Chr+"\t"+"\t".join([str(x) for x in [START,MIDDLE,END,LENGTH]])+"\t"+str(len(SNPX1))+"\t"+D1s+"\t"+D2s+"\n")
outWindow.write(Chr + "\t" + "\t".join([str(x) for x in [
START, MIDDLE, END, LENGTH]]) + "\t" + str(len(SNPX1)) + "\t" + D1s + "\t" + D2s + "\n")


## open Outputfile for genome-wide estimates
outGW=open(options.OUT+"_GenomeWide.D","w")
# open Outputfile for genome-wide estimates
outGW = open(options.OUT + "_GenomeWide.D", "w")

## calculate Jacknife stat
# calculate Jacknife stat
outGW.write("Method\tDest\tDjackEst\tDjackVar\tZ\n")

Z1=blockJackEven(fSNPX1,fSNPY1,D1L)
outGW.write("SoraggiEtAl\t"+"\t".join([str(x) for x in Z1])+"\n")
Z1 = blockJackEven(fSNPX1, fSNPY1, D1L)
outGW.write("SoraggiEtAl\t" + "\t".join([str(x) for x in Z1]) + "\n")

Z2=blockJackEven(fSNPX2,fSNPY2,D2L)
outGW.write("DurandEtAl\t"+"\t".join([str(x) for x in Z2])+"\n")
Z2 = blockJackEven(fSNPX2, fSNPY2, D2L)
outGW.write("DurandEtAl\t" + "\t".join([str(x) for x in Z2]) + "\n")

0 comments on commit 5afbcdd

Please sign in to comment.