cmm.py
# Main script for running CMM (Coupled Mixed Model)
# If you find this work useful, please cite:
# Wang H., Pei F., Vanyukov MM., Bahar I., Wu W. and Xing EP.
# Coupled Mixed Model for Joint GWAS of Pairs of Complex Disorders with Independently Collected Datasets
# Contact: {haohanw,weiwu2,epxing}@cs.cmu.edu
import sys
import numpy as np
from utility.dataLoader import FileReader
from utility.matchSNPList import matchSNPs
from model.CMM import CMM
def printOutHead(out): out.write("\t".join(["RANK", "SNP_ID", "EFFECT_SIZE_ABS"]) + "\n")
def outputResult(out, rank, id, beta):
out.write("\t".join([str(x) for x in [rank, id, beta]]) + "\n")
from optparse import OptionParser, OptionGroup
usage = """usage: %prog [options] --file1 fileName1 --file2 fileName2
This program provides the basic usage of CMM, e.g.:
python cmm.py --file1 data/mice1.plink --file2 data/mice2.plink
"""
parser = OptionParser(usage=usage)
dataGroup = OptionGroup(parser, "Data Options")
modelGroup = OptionGroup(parser, "Model Options")
## data options
dataGroup.add_option("-t", dest='fileType', default='plink', help="choices of input file type")
dataGroup.add_option("--file1", dest='fileName1', help="name of the input file for one data set")
dataGroup.add_option("--file2", dest='fileName2', help="name of the input file for the other data set")
## model options
modelGroup.add_option("--lambda", dest="lmbd", default=None,
help="the weight of the penalizer. If neither lambda or snum is given, cross validation will be run.")
modelGroup.add_option("--rho", dest="rho", default=1e10,
help="the weight of the penalizer for common support. If neither lambda or snum is given, cross validation will be run.")
modelGroup.add_option("--snum", dest="snum", default=None,
help="the number of targeted variables the model selects. If neither lambda or snum is given, cross validation will be run. "
"We highly recommend users to specify the numbers to select, "
"then the program will run to make sure there are at least some common snps to be selected for these phenotypes.")
modelGroup.add_option('-q', action='store_true', dest='quiet', default=False, help='Run in quiet mode')
modelGroup.add_option('-m', action='store_true', dest='missing', default=False,
help='Run without missing genotype imputation')
## advanced options
parser.add_option_group(dataGroup)
parser.add_option_group(modelGroup)
(options, args) = parser.parse_args()
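
## constants controlling the optimization and the hyper-parameter searches below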
learningRate = 1e-5
min_lambda_default = 1e-5
max_lambda_default = 1e5
minFactor = 0.6
maxFactor = 1.5
maxIteration = 50
patience = 5
def cross_val_score(clf, X1, y1, X2, y2, cv=5, learningRate=1):
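    """
    Score one hyper-parameter setting of `clf` by k-fold cross validation.

    Each of the two datasets is split into `cv` contiguous folds; for every fold
    the model is fit on the remaining samples of both datasets, and the held-out
    fold is scored by the sum of the two mean squared prediction errors.
    Returns the list of per-fold scores.
    """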
scores = []
[n1, p] = X1.shape
[n2, p] = X2.shape
b1 = n1 / cv
b2 = n2 / cv
for i in range(cv):
ind1 = np.arange(b1) + b1 * i
Xtr1 = np.delete(X1, ind1, axis=0)
ytr1 = np.delete(y1, ind1, axis=0)
Xte1 = X1[ind1, :]
yte1 = y1[ind1]
ind2 = np.arange(b2) + b2 * i
Xtr2 = np.delete(X2, ind2, axis=0)
ytr2 = np.delete(y2, ind2, axis=0)
        Xte2 = X2[ind2, :]
        yte2 = y2[ind2]
clf.setLearningRate(learningRate)
clf.fit(Xtr1, ytr1, Xtr2, ytr2)
ypr1 = clf.predict1(Xte1)
ypr2 = clf.predict2(Xte2)
s = np.mean(np.square(ypr1 - yte1)) + np.mean(np.square(ypr2 - yte2))
scores.append(s)
return scores
def crossValidation(clf, X1, y1, X2, y2, learningRate):
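    """
    Grid-search lambda and rho (both over 10^-10 ... 10^9) with 5-fold cross
    validation, keep the pair with the lowest mean error, then refit on the
    full data and return the two coefficient vectors.
    """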
minError = np.inf
minLam = 0
minRho = 0
for r in range(-10, 10):
rho = 10.0**r
for i in range(-10, 10):
lam = 10.0**i
clf.setLambda(lam)
clf.setRho(rho)
scores = cross_val_score(clf, X1, y1, X2, y2, cv=5, learningRate=learningRate)
score = np.mean(np.abs(scores))
if score < minError:
minError = score
minLam = lam
minRho = rho
clf.setLambda(minLam)
clf.setRho(minRho)
clf.setLearningRate(learningRate)
clf.fit(X1, y1, X2, y2)
beta1 = clf.getBeta1()
    beta2 = clf.getBeta2()
return beta1, beta2
def binarySearchJoint_DoubleLambda(model, X1, y1, X2, y2,
select_num, lrScale, quiet=True):
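    """
    Tune lambda1 and lambda2 by independent binary searches in log space so
    that each dataset ends up with roughly `select_num` non-zero coefficients
    (between select_num * minFactor and select_num * maxFactor). The search
    stops early when the selection counts stop changing for `patience`
    consecutive iterations.
    """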
betaM1 = np.zeros([X1.shape[1]])
betaM2 = np.zeros([X2.shape[1]])
min_lambda1 = min_lambda_default
max_lambda1 = max_lambda_default
min_lambda2 = min_lambda_default
max_lambda2 = max_lambda_default
stuckCount = 1
previousC = -1
iteration = 0
while ((min_lambda1 < max_lambda1) or (min_lambda2 < max_lambda2)) and iteration < maxIteration:
iteration += 1
breakFlag = False
if min_lambda1 < max_lambda1:
lmbd1 = np.exp((np.log(min_lambda1) + np.log(max_lambda1)) / 2.0)
if min_lambda2 < max_lambda2:
lmbd2 = np.exp((np.log(min_lambda2) + np.log(max_lambda2)) / 2.0)
if not quiet:
print "\t\tIter:{}\tlambda1:{}\tlambda2:{}".format(iteration, lmbd1, lmbd2),
model.setLambda1(lmbd1)
model.setLambda2(lmbd2)
model.setLearningRate(learningRate * lrScale) # learning rate must be set again every time we run it.
model.fit(X1, y1, X2, y2)
beta1 = model.getBeta1()
beta2 = model.getBeta2()
c1 = len(np.where(np.abs(beta1) > 0)[0])
c2 = len(np.where(np.abs(beta2) > 0)[0])
c0 = len(np.where(np.minimum(np.abs(beta1), np.abs(beta2))!=0)[0])
if not quiet:
print "# Chosen:{} and {} with common {}".format(c1, c2, c0)
if c1 > select_num * minFactor and c2 > select_num * minFactor and c0 == 0:
return beta1, beta2
if c1 < select_num * minFactor: # Regularizer too strong
max_lambda1 = lmbd1
elif c1 > select_num * maxFactor: # Regularizer too weak
min_lambda1 = lmbd1
else:
breakFlag = True
if c2 < select_num * minFactor: # Regularizer too strong
max_lambda2 = lmbd2
elif c2 > select_num * maxFactor: # Regularizer too weak
min_lambda2 = lmbd2
else:
if breakFlag:
betaM1 = beta1
betaM2 = beta2
break
betaM1 = beta1
betaM2 = beta2
if c1 + c2 == previousC:
stuckCount += 1
else:
previousC = c1 + c2
stuckCount = 1
if stuckCount > patience:
if not quiet:
print 'Run out of patience'
break
return betaM1, betaM2
def binarySearch_Rho(model, X1, y1, X2, y2, snum, quiet):
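    """
    Tune the coupling weight rho by binary search in log space: rho is
    increased until the supports selected for the two datasets share at least
    one SNP, and decreased when far too many (or no) variables are selected.
    The lambdas are re-tuned for every candidate rho via
    binarySearchJoint_DoubleLambda.
    """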
minRho = 1e0
maxRho = 1e15
maxMainIteration = 10
mainIteration = 0
Beta1 = None
Beta2 = None
while mainIteration < maxMainIteration and minRho < maxRho:
rho = np.exp((np.log(minRho) + np.log(maxRho)) / 2.0)
if not quiet:
print '-------------------------------'
print 'RHO:', rho
model.setRho(rho)
beta1, beta2 = binarySearchJoint_DoubleLambda(model, X1=X1, y1=y1, X2=X2, y2=y2, select_num=snum, lrScale=1e-3, quiet=quiet)
ind1 = np.where(beta1!=0)[0].tolist()
ind2 = np.where(beta2!=0)[0].tolist()
if len(ind1) + len(ind2) > snum*maxFactor*100:
maxRho = rho
continue
elif len(ind1) == 0 and len(ind2) == 0:
maxRho = rho
continue
else:
m = []
for a in ind1:
if a in ind2:
m.append(a)
if not quiet:
print 'SNPs in common', len(m)
if len(m) < 1:
minRho = rho
continue
else:
Beta1 = beta1
Beta2 = beta2
break
return Beta1, Beta2
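
## main script: read the two datasets, match their SNPs, fit CMM, and write the ranked results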
fileType = 0
IN = None
if len(args) != 0 or options.fileName1 is None or options.fileName2 is None:
    parser.print_help()
    sys.exit()
outFile1 = options.fileName1 + '.output'
outFile2 = options.fileName2 + '.output'
print 'Running ... '
reader = FileReader(fileName=options.fileName1, fileType=options.fileType, imputation=(not options.missing))
X1, Y1, Xname1 = reader.readFiles()
reader = FileReader(fileName=options.fileName2, fileType=options.fileType, imputation=(not options.missing))
X2, Y2, Xname2 = reader.readFiles()
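# align the two genotype matrices on a common SNP list so that their coefficients are comparable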
X1, Xname1, X2, Xname2 = matchSNPs(X1, Xname1, X2, Xname2)
model = CMM(quiet=options.quiet)
print 'Computation starts ... '
if options.snum is not None:  # select a fixed number of variables
snum = float(options.snum)
Beta1, Beta2 = binarySearch_Rho(model, X1, Y1, X2, Y2, snum, quiet=options.quiet)
elif options.lmbd is not None:
lmbd = float(options.lmbd)
model.setLambda(lmbd)
model.setRho(float(options.rho))
model.setLearningRate(learningRate) # learning rate must be set again every time we run it.
model.fit(X1, Y1, X2, Y2)
Beta1 = model.getBeta1()
Beta2 = model.getBeta2()
else:
Beta1, Beta2 = crossValidation(model, X1, Y1, X2, Y2, learningRate)
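# rank the selected SNPs of each dataset by effect size and write them to the output files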
ind1 = np.where(Beta1 != 0)[0]
bs1 = Beta1[ind1].tolist()
xname1 = []
for i in ind1:
    xname1.append(Xname1[i])
beta_name1 = zip(bs1, xname1)
bn = sorted(beta_name1)
bn.reverse()
out1 = open(outFile1, 'w')
printOutHead(out1)
for i in range(len(bn)):
outputResult(out1, i + 1, bn[i][1], bn[i][0])
out1.close()
ind2 = np.where(Beta2 != 0)[0]
bs2 = Beta2[ind2].tolist()
xname2 = []
for i in ind2:
    xname2.append(Xname2[i])
beta_name2 = zip(bs2, xname2)
bn = sorted(beta_name2)
bn.reverse()
out2 = open(outFile2, 'w')
printOutHead(out2)
for i in range(len(bn)):
outputResult(out2, i + 1, bn[i][1], bn[i][0])
out2.close()
print '\nComputation ends normally, check the output files at', outFile1, outFile2