Skip to content

Commit 57df481

Browse files
Jonathan Y. HsuJonathan Y. Hsu
authored andcommitted
power estimation optional
1 parent af37ad3 commit 57df481

File tree

5 files changed

+186
-47
lines changed

5 files changed

+186
-47
lines changed

SURF/command_line/CRISPR_SURF_Outputs.py

Lines changed: 84 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def crispr_surf_sgRNA_summary_table_update(sgRNA_summary_table, gammas2betas, av
104104

105105
f.write(','.join(map(str, row)) + '\n')
106106

107-
def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
107+
def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, estimate_statistical_power, out_dir):
108108
"""
109109
Function to output total beta profile.
110110
"""
@@ -118,18 +118,32 @@ def complete_beta_profile(gammas2betas, simulation_n, padj_cutoffs, out_dir):
118118
padj_min = np.min([float(x) for x in gammas2betas['padj'] if str(x) != 'nan' and float(x) > 0])
119119
pvals_new = [float(x) if float(x) > 0 else p_min for x in pvals]
120120
pvals_adj_new = [float(x) if float(x) > 0 else padj_min for x in pvals_adj]
121-
power = gammas2betas['power']
121+
122+
if estimate_statistical_power == 'yes':
123+
power = gammas2betas['power']
122124

123-
df = pd.DataFrame({
124-
'Chr': chrom,
125-
'Index': indices,
126-
'Beta': betas,
127-
'Pval.': pvals_new,
128-
'Pval_adj.': pvals_adj_new,
129-
'Statistical_Power': power
130-
})
125+
df = pd.DataFrame({
126+
'Chr': chrom,
127+
'Index': indices,
128+
'Beta': betas,
129+
'Pval.': pvals_new,
130+
'Pval_adj.': pvals_adj_new,
131+
'Statistical_Power': power
132+
})
131133

132-
df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])
134+
df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.','Statistical_Power'])
135+
136+
else:
137+
138+
df = pd.DataFrame({
139+
'Chr': chrom,
140+
'Index': indices,
141+
'Beta': betas,
142+
'Pval.': pvals_new,
143+
'Pval_adj.': pvals_adj_new
144+
})
145+
146+
df.to_csv(path_or_buf = (out_dir + '/beta_profile.csv'), index = False, header = True, columns = ['Chr','Index','Beta','Pval.','Pval_adj.'])
133147

134148
def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cutoffs, scale, guideindices2bin, out_dir):
135149

@@ -201,7 +215,7 @@ def crispr_surf_significant_regions(sgRNA_summary_table, gammas2betas, padj_cuto
201215
if len(associated_sgRNAs) > 0:
202216
f.write(','.join(map(str, [padj_cutoff, chrom, genomic_boundary_start, genomic_boundary_stop, significance_direction, signal_area, signal_mean, padj_mean, len(associated_sgRNAs), ','.join(map(str, associated_sgRNAs))])) + '\n')
203217

204-
def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, out_dir):
218+
def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, scale, guideindices2bin, estimate_statistical_power, out_dir):
205219

206220
"""
207221
Function to output tracks to load on IGV.
@@ -265,27 +279,70 @@ def crispr_surf_IGV(sgRNA_summary_table, gammas2betas, padj_cutoffs, genome, sca
265279
# Output raw and deconvolved scores IGV track
266280
dff = df_summary_table[pd.notnull(df_summary_table['Chr']) & pd.notnull(df_summary_table['Perturbation_Index']) & pd.notnull(df_summary_table['Raw_Signal_Combined']) & pd.notnull(df_summary_table['Deconvolved_Signal_Combined'])]
267281

268-
with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
282+
# with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
269283

270-
for index, row in dff.iterrows():
284+
# for index, row in dff.iterrows():
271285

272-
for r in range(1, replicates + 1):
273-
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
286+
# for r in range(1, replicates + 1):
287+
# raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
274288

275-
for index in range(len(gammas2betas['indices'])):
289+
# for index in range(len(gammas2betas['indices'])):
276290

277-
if float(gammas2betas['padj'][index]) > 0:
278-
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
279-
else:
280-
neglog10_pval = -math.log10(padj_min)
291+
# if float(gammas2betas['padj'][index]) > 0:
292+
# neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
293+
# else:
294+
# neglog10_pval = -math.log10(padj_min)
295+
296+
# deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
297+
# neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
298+
# statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
299+
300+
if estimate_statistical_power == 'yes':
281301

282-
deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
283-
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
284-
statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
302+
with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals, open(out_dir + '/statistical_power.bedgraph', 'w') as statistical_power:
303+
304+
for index, row in dff.iterrows():
305+
306+
for r in range(1, replicates + 1):
307+
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
308+
309+
for index in range(len(gammas2betas['indices'])):
310+
311+
if float(gammas2betas['padj'][index]) > 0:
312+
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
313+
else:
314+
neglog10_pval = -math.log10(padj_min)
285315

286-
# Create IGV session
287-
with open('/SURF/igv_session_template.xml', 'r') as f:
288-
igv_template = f.read()
316+
deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
317+
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
318+
statistical_power.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['power'][index])])) + '\n')
319+
320+
# Create IGV session
321+
with open('/SURF/igv_session_template.xml', 'r') as f:
322+
igv_template = f.read()
323+
324+
else:
325+
326+
with open(out_dir + '/raw_scores.bedgraph', 'w') as raw_scores, open(out_dir + '/deconvolved_scores.bedgraph', 'w') as deconvolved_scores, open(out_dir + '/neglog10_pvals.bedgraph', 'w') as neglog10_pvals:
327+
328+
for index, row in dff.iterrows():
329+
330+
for r in range(1, replicates + 1):
331+
raw_scores.write('\t'.join(map(str, [row['Chr'], int(row['Perturbation_Index']), int(row['Perturbation_Index']), float(row['Log2FC_Replicate%s' % r]), row['sgRNA_Sequence']])) + '\n')
332+
333+
for index in range(len(gammas2betas['indices'])):
334+
335+
if float(gammas2betas['padj'][index]) > 0:
336+
neglog10_pval = -math.log10(float(gammas2betas['padj'][index]))
337+
else:
338+
neglog10_pval = -math.log10(padj_min)
339+
340+
deconvolved_scores.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), float(gammas2betas['combined'][index])])) + '\n')
341+
neglog10_pvals.write('\t'.join(map(str, [gammas2betas['chr'][index], int(gammas2betas['indices'][index]), int(gammas2betas['indices'][index]), neglog10_pval])) + '\n')
342+
343+
# Create IGV session
344+
with open('/SURF/igv_session_template_nopower.xml', 'r') as f:
345+
igv_template = f.read()
289346

290347
igv_template = igv_template.replace('#genome#', str(genome))
291348

SURF/command_line/CRISPR_SURF_Statistical_Significance.py

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def crispr_surf_deconvolved_signal(gammas2betas, gamma_chosen, averaging_method,
6060

6161
return gammas2betas
6262

63-
def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale):
63+
def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, perturbation_profile, gammas2betas, null_distribution, simulation_n, test_type, guideindices2bin, averaging_method, padj_cutoffs, effect_size, limit, scale, estimate_statistical_power):
6464

6565
"""
6666
Function to assess the statistical significance of deconvolved genomic signal.
@@ -201,29 +201,30 @@ def crispr_surf_statistical_significance(sgRNA_summary_table, sgRNA_indices, per
201201
new_p_cutoff = beta_pvals[pymin(range(len(beta_pvals_adj)), key=lambda i: pyabs(beta_pvals_adj[i] - float(padj_cutoffs[0])))]
202202

203203
# Estimate statistical power
204-
beta_statistical_power = []
205-
if scale > 1:
206-
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
204+
if estimate_statistical_power == 'yes':
205+
beta_statistical_power = []
206+
if scale > 1:
207+
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = guideindices2bin.keys(), gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
207208

208-
else:
209-
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
209+
else:
210+
beta_corrected_effect_size = crispr_surf_statistical_power(sgRNA_indices = sgRNA_indices, gammas2betas = gammas2betas, effect_size = effect_size, gamma_chosen = gamma_chosen, perturbation_profile = perturbation_profile, scale = scale)
210211

211-
for i in range(len(beta_corrected_effect_size)):
212+
for i in range(len(beta_corrected_effect_size)):
212213

213-
# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
214-
# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
214+
# shifted_distribution = [x + beta_corrected_effect_size[i] for x in beta_distributions_null[i]]
215+
# percentile_cutoff = np.percentile(beta_distributions_null[i], (100.0 - float(new_p_cutoff)*100.0/2.0))
215216

216-
beta_dist_null = np.array(beta_distributions_null[i])
217-
shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
218-
percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
217+
beta_dist_null = np.array(beta_distributions_null[i])
218+
shifted_distribution = beta_dist_null + beta_corrected_effect_size[i]
219+
percentile_cutoff = np.percentile(beta_dist_null, (100.0 - float(new_p_cutoff)*100.0/2.0))
219220

220-
if (i + 1)%500 == 0:
221-
logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
221+
if (i + 1)%500 == 0:
222+
logger.info('Calculated statistical power for %s out of %s betas ...' % ((i + 1), len(beta_distributions)))
222223

223-
# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
224+
# beta_statistical_power.append(float(sum(x >= percentile_cutoff for x in shifted_distribution))/float(len(shifted_distribution)))
224225

225-
beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))
226+
beta_statistical_power.append((shifted_distribution > percentile_cutoff).sum() / float(len(shifted_distribution)))
226227

227-
gammas2betas['power'] = beta_statistical_power
228+
gammas2betas['power'] = beta_statistical_power
228229

229230
return gammas2betas, replicate_parameters

SURF/command_line/SURF_deconvolution.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
parser.add_argument('-corr', '--correlation', type = float, default = 0, help = 'The correlation between biological replicates to determine a reasonable lambda for the deconvolution operation. if 0 (default), the --characteristic_perturbation_range argument will be used to set an appropriate correlation.')
3737
parser.add_argument('-genome', '--genome', type = str, default = 'hg19', help = 'The genome to be used to create the IGV session file (hg19, hg38, mm9, mm10, etc.).')
3838
parser.add_argument('-effect_size', '--effect_size', type = float, default = 1, help = 'Effect size to estimate statistical power.')
39+
parser.add_argument('-estimate_power', '--estimate_statistical_power', type = str, choices = ['yes', 'no'], default = 'no', help = 'Whether or not to compute a track estimating statistical power for the CRISPR tiling screen data.')
3940
parser.add_argument('-padjs', '--padj_cutoffs', default = 0, nargs = '+', help = 'List of p-adj. (Benjamini-Hochberg) cut-offs for determining significance of regulatory regions in the CRISPR tiling screen.')
4041
parser.add_argument('-out_dir', '--out_directory', type = str, default = 'CRISPR_SURF_Analysis_%s' % (timestr), help = 'The name of the output directory to place CRISPR-SURF analysis files.')
4142
args = parser.parse_args()
@@ -56,6 +57,7 @@
5657
genome = args.genome
5758
padj_cutoffs = args.padj_cutoffs
5859
effect_size = args.effect_size
60+
estimate_statistical_power = args.estimate_statistical_power
5961
out_dir = args.out_directory
6062

6163
##### Create output directory
@@ -260,7 +262,7 @@
260262

261263
##### Bootstrap deconvolution analysis to assign statistical significance
262264
try:
263-
gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale)
265+
gammas2betas_updated, replicate_parameters = crispr_surf_statistical_significance(sgRNA_summary_table = sgRNAs_summary_table, sgRNA_indices = sgRNA_indices, perturbation_profile = perturbation_profile, gammas2betas = gammas2betas_updated, null_distribution = null_distribution, simulation_n = simulation_n, test_type = test_type, guideindices2bin = guideindices2bin, averaging_method = averaging_method, padj_cutoffs = padj_cutoffs, effect_size = effect_size, limit = limit, scale = scale, estimate_statistical_power = estimate_statistical_power)
264266
logger.info('Finished simulations to assess statistical significance of deconvolution profile ...')
265267

266268
except:
@@ -278,7 +280,7 @@
278280

279281
##### Output beta profile
280282
try:
281-
complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, out_dir = out_dir)
283+
complete_beta_profile(gammas2betas = gammas2betas_updated, simulation_n = simulation_n, padj_cutoffs = padj_cutoffs, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
282284
logger.info('Successfully created beta profile file ...')
283285

284286
except:
@@ -296,7 +298,7 @@
296298

297299
##### Output IGV tracks
298300
try:
299-
crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, out_dir = out_dir)
301+
crispr_surf_IGV(sgRNA_summary_table = sgRNAs_summary_table.split('/')[-1].replace('.csv', '_updated.csv'), gammas2betas = gammas2betas_updated, padj_cutoffs = padj_cutoffs, genome = genome, scale = scale, guideindices2bin = guideindices2bin, estimate_statistical_power = estimate_statistical_power, out_dir = out_dir)
300302
logger.info('Successfully created IGV tracks ...')
301303

302304
except:
@@ -321,6 +323,7 @@
321323
'correlation': correlation,
322324
'genome': genome,
323325
'effect_size': effect_size,
326+
'estimate_statistical_power': estimate_statistical_power,
324327
'padj_cutoffs': ' '.join(map(str, padj_cutoffs)),
325328
'out_directory': out_dir
326329
}

0 commit comments

Comments
 (0)