-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbayescce.m
217 lines (191 loc) · 10.3 KB
/
bayescce.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
%%% bayescce.m
%%%
%%% Summary: A function to estimate cell counts using DNA methylation data from heterogeneous tissue and a prior information on the cell type distribution in the studied tissue.
%%% This is an implementation of the BayesCCE method (Bayesian Cell Count Estimation). The algorithm is described in Rahmani et al. 2018: A Bayesian Framework for Estimating Cell Type Composition from DNA Methylation Without the Need for Methylation Reference.
%%% Author: Elior Rahmani ([email protected])
%%% Date: 25th January 2017 (latest update: Feb 10th 2018)
%%%
%%% This function was implemented and tested using Matlab 2015b.
%
% INPUT:
% (1) X - an m by n matrix of sites by samples methylation levels.
% For data preparation, it is advised to follow the recommendations for applying ReFACTor (Rahmani et al., Nature Methods 2016): exclude sites with extremely low variability and exclude polymorphic and cross-reactive sites as well sites coming from the sex chromosomes.
% The full set of recommendations for data preparations for ReFACTor can be found under "Tissue heterogeneity" at: glint-epigenetics.readthedocs.io
% (2) model_covars - an n by p matrix of p covariates for the n individuals.
% model_covars should include covariates that may have large effects on many methylation sites but are not expected to be correlated with the cell type composition (e.g., batch information).
% (3) refactor_covars - an n by p' matrix of p' covariates for the n individuals. Note that p' may be different than p (the number of covariates in model_covars).
% These covariates will be accounted for while calculating the ReFACTor components. Any covariate that is expected to have large effects on many methylation sites should be included, including covariates that may be correlated with the cell type composition (e.g., age and sex).
% (4) k_refactor - the parameter k for the calculation of the ReFACTor components (the number of assumed cell types; for more information see the documentation of ReFACTor at: glint-epigenetics.readthedocs.io).
% (5) d - the number of ReFACTor components to use (typically can be the same as k_refactor, however, when the cell type composition information spans over more PCs, increasing d is expected to improve the resutls).
% (6) t - the number of sites to use in the optimization (the top t sites selected by ReFACTor are considered; for more information see the documentation of ReFACTor at: glint-epigenetics.readthedocs.io).
% (7) alpha - a column vector with k values (the number of assumed cell types), each corresponding to one parameter of the Dirichlet prior on the cell proportions. Note that k=length(alpha) (i.e. the number of assumed cell types)
% Alternatively, alpha can be a k times n matrix, in which case each individual gets their own prior (can be used, for example, for providing different priors to different populations in the sample such as cases and controls).
%
% BayesCCE impute (OPTIONAL; provide cell counts for some of the samples):
% (8) R_reference (optional) - an n0 by k matrix of cell type proportions
% for n0 refernece individuals. Provide an empty vector if does not apply (i.e. []).
% (9) reference_indices (optional) - an n0 by 1 vector with indices of the
% reference samples in X. Provide an empty vector if does not apply (i.e. []).
%
% OUTPUT:
% (1) R_est - an n by k matrix of cell composition estimates. Each column is expected to correspond to one cell type.
% Note that the i-th column is not guaranteed to correspond to the i-th cell type as indicated by alpha, unless BayesCCE impute is used (i.e. the R_reference and reference_indices are provided).
% If BayesCCE impute is used then each estimated colum is also expected to match its corresponding cell type in scale (i.e. provide estimates in absolute values).
% Note that if R_reference is given then R_est(reference_indices,:) contain R_reference.
% (2) M_est - a t by k matrix of the estimated cell-type specific mean levels in each of the t selected sites.
% (3) beta_est - t by p matrix of the estimated effects of the model covariates on on each of the t selected sites (empy if no covariates were provided).
% (4) sites - the indices of the sites in X that were used in the optimization.
function [R_est,M_est,beta_est,sites] = bayescce(X,model_covars,refactor_covars,k_refactor,d,t,alpha,R_reference,reference_indices)
epsilon = 0.0001; % the minimal value for R_{ih} (should be greater than zero to allow evaluation of the log(R_{ih}) values in the objective).
epsilon_sum = 0.05; % the sum of each sample's cell proportions will be in the range [1-epsilon_sum,1+epsilon_sum].
epsilon_ref = 0.01; % cell counts (if R_reference is provided) may include measurement errors; R_{ih} will be constrained to the range [R_{ih}*(1-epsilon_ref],R_{ih}*(1+epsilon_ref)].
% Settings for the optimization
fmincon_options = optimset('fmincon');
fmincon_options = optimset('GradObj','on','Display','iter','MaxIter',5000,'MaxFunEvals',30000);
fmincon_options_given_M = optimset('GradObj','on','Display','iter','MaxIter',5000,'MaxFunEvals',30000);
if ~exist('R_reference','var')
R_reference = [];
reference_indices = [];
else
assert (size(R_reference,1) == length(reference_indices));
non_reference_indices = setdiff(1:size(X,2),reference_indices);
end
% Compute the ReFACTor components
X_adj = X;
if (~isempty(refactor_covars))
X_adj = adjust_data(X',refactor_covars)';
end
[~,ranked_list] = refactor(X_adj,k_refactor,t,k_refactor);
sites = ranked_list(1:t);
O = X(sites,:);
if (size(alpha,2) == 1)
alpha = repmat(alpha,1,size(O,2));
end
assert (size(alpha,2) == size(O,2));
% Set the dimensions of the problem
m = t;
n = size(O,2);
n0 = length(reference_indices);
n1 = n-n0;
C = model_covars;
p = size(C,2);
k = size(alpha,1);
M_est = [];
if (~isempty(R_reference))
% There are cell counts for a subset of the samples; estimate M, the cell type specific methylation mean of each site in O
M_est = zeros(size(O,1),k);
options_lsqlin = optimset('lsqlin');
options_lsqlin = optimset(options_lsqlin,'Display','none');
% foreach 1<=i<=m: M_{ih} in [0,1]
lb = zeros(k,1);
ub = ones(k,1);
for i = 1:m
M_est(i,:) = lsqlin(R_reference,O(i,reference_indices)',[],[],[],[],lb,ub,[],options_lsqlin);
end
end
% Compute the loadings and scores of the PCs of O (these are the ReFACTor components)
[L,P] = pca(zscore(O'));
L = [ones(m,1) L(:,1:(d))];
P = [ones(n,1) P(:,1:(d))];
d = d + 1;
% Set an initial point that is feasible under the constraints
xx = [mean(alpha')/n]';
s = xx'/sum(xx);
if (isempty(M_est))
x0 = zeros(2*d*k+d*p,1);
for h = 1:k
x0(d*k+1+(h-1)*d) = s(h);
end
else
x0 = zeros(d*k+d*p,1);
for h = 1:k
x0(1+(h-1)*d) = s(h);
end
end
% Formulate the constraints of the problem
A_eq = [];
b_eq = [];
if (isempty(M_est))
% No reference cell counts were provided
A_ineq = zeros(k*(n+2*m)+n*2,length(x0));
b_ineq = zeros(k*(n+2*m)+n*2,1);
for h = 1:k
% foreach i: foreach h: R_{ih} >= 0
A_ineq((h-1)*n+1:h*n , d*k+1+(h-1)*d:d*k+h*d) = -P;
b_ineq((h-1)*n+1:h*n) = epsilon;
% foreach j: foreach h: M_{jk} is in the range [0,1]
A_ineq((k+2)*n+1+(h-1)*m:(k+2)*n+h*m, (h-1)*d+1:h*d) = -L;
A_ineq((k+2)*n+m*k+1+(h-1)*m:(k+2)*n+m*k+h*m, (h-1)*d+1:h*d) = L;
b_ineq((k+2)*n+1+(h-1)*m:(k+2)*n+h*m) = 0;
b_ineq((k+2)*n+m*k+1+(h-1)*m:(k+2)*n+m*k+h*m) = 1;
end
% foreach i: sum_{h=1}^k R_{ih} is in the range [1-epsilon,1+epsilon]
A_ineq(k*n+1:(k+1)*n , d*k+1:2*k*d) = repmat(P,1,k);
A_ineq((k+1)*n+1:(k+2)*n , d*k+1:2*k*d) = repmat(-P,1,k);
b_ineq(k*n+1:(k+1)*n) = 1+epsilon_sum;
b_ineq((k+1)*n+1:(k+2)*n) = -(1-epsilon_sum);
else
% Cell counts were provided for some of the samples; use M_est
A_ineq = zeros(n1*k+2*n+n0*2*k,length(x0));
b_ineq = zeros(n1*k+2*n+n0*2*k,1);
for h = 1:k
% foreach i in n1 (non-reference): foreach h: R_{ih} >= 0
A_ineq((h-1)*n1+1:h*n1 , 1+(h-1)*d:h*d) = -P(non_reference_indices,:);
b_ineq((h-1)*n1+1:h*n1) = epsilon;
% foreach i in n0 (reference): foreach h: R_{ih} in [R_{ih}*(1-epsilon_ref],R_{ih}*(1+epsilon_ref)]
A_ineq(n1*k+(h-1)*n0+1:n1*k+h*n0 , 1+(h-1)*d:h*d) = P(reference_indices,:);
A_ineq(n1*k+n0*k+(h-1)*n0+1:n1*k+n0*k+h*n0 , 1+(h-1)*d:h*d) = -P(reference_indices,:);
b_ineq(n1*k+(h-1)*n0+1:n1*k+h*n0) = R_reference(:,h)*(1+epsilon_ref);
b_ineq(n1*k+n0*k+(h-1)*n0+1:n1*k+n0*k+h*n0) = -R_reference(:,h)*(1-epsilon_ref);
end
% foreach i: sum_{h=1}^k R_{ih} is in the range [1-epsilon,1+epsilon]
A_ineq(k*(n0*2+n1)+1:k*(n0*2+n1)+n , 1:k*d) = repmat(P,1,k);
A_ineq(k*(n0*2+n1)+n+1:k*(n0*2+n1)+n*2 , 1:k*d) = repmat(-P,1,k);
b_ineq(k*(n0*2+n1)+1:k*(n0*2+n1)+n) = 1+epsilon_sum;
b_ineq(k*(n0*2+n1)+n+1:k*(n0*2+n1)+n*2) = -(1-epsilon_sum);
end
% Estimate sigma^2
sse = 0;
for i = 1:m
[~,~,res] = regress(O(i,:)',[P C]);
sse = sse + sum(res.^2);
end
sigma2_hat = sse./(n*m);
if (isempty(M_est))
if (p ==0)
% No covaraites were provided
f = @(X) ecc_obj(O,L,P,reshape(X(1:(k*d)),d,k),reshape(X(k*d+1:2*k*d),d,k),reshape(X(2*k*d+1:end),d,p),alpha,sigma2_hat);
else
f = @(X) ecc_obj_covars(O,L,P,C,reshape(X(1:(k*d)),d,k),reshape(X(k*d+1:2*k*d),d,k),reshape(X(2*k*d+1:end),d,p),alpha,sigma2_hat);
end
[x,fval,exitflag,output,lamb,grad] = fmincon(f,x0,A_ineq,b_ineq,A_eq,b_eq,[],[],[],fmincon_options);
else
if (p ==0)
% No covaraites were provided
f = @(X) ecc_obj_given_M(O,L,P,M_est,reshape(X(1:(k*d)),d,k),reshape(X(k*d+1:k*d+p*d),d,p),alpha,sigma2_hat);
else
f = @(X) ecc_obj_given_M_covars(O,L,P,M_est,reshape(X(1:(k*d)),d,k),reshape(X(k*d+1:k*d+p*d),d,p),alpha,sigma2_hat,model_covars);
end
[x,fval,exitflag,output,lamb,grad] = fmincon(f,x0,A_ineq,b_ineq,A_eq,b_eq,[],[],[],fmincon_options_given_M );
end
beta_est = [];
if (isempty(M_est))
A = reshape(x(1:d*k),d,k);
V = reshape(x(d*k+1:2*k*d),d,k);
B = reshape(x(2*k*d+1:end),d,p);
R_est = [V'*P']';
M_est = L*A;
if (~isempty(B))
beta_est = L*B;
end
else
V = reshape(x(1:k*d),d,k);
B = reshape(x(k*d+1:end),d,p);
R_est = [V'*P']';
% Use the known cell counts for the reference samples
R_est(reference_indices,:) = R_reference;
if (~isempty(B))
beta_est = L*B;
end
end
end