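Update how weights are computed: derive normalized square-root weights from the weight column (floored at 1) once, and use their reciprocals in the LRT and prediction metrics; gate per-annotation analysis behind a `partitioned` flag.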

shz9 · shz9 · commit beca755c7365 · 2020-08-12T18:30:05.000-04:00
diff --git a/gamma_glm_model.py b/gamma_glm_model.py
@@ -299,16 +299,15 @@ def fit_ldscore_model(ld_df, ld_col_names, w_col_name=None,
 
 
 def get_model_lrt(coef, est_intercept,
-                  ld_df, ld_col_names, w_col_name,
+                  ld_df, ld_col_names, ld_weights,
                   chisq_col='CHISQ',
                   null_fit_intercept=False):
 
     X = ld_df.loc[:, ['N'] + ld_col_names].values
     X[:, 0] = 1.
 
     y = np.fmax(1e-6, ld_df[chisq_col])
-
-    u = np.maximum(ld_df[w_col_name].values, 1.)
+    u = 1./ld_weights
 
     fitted_tau = np.append([est_intercept], coef)
     fitted_ll = gamma_loglik(fitted_tau, X, y, u=u)
diff --git a/perform_regression.py b/perform_regression.py
@@ -382,8 +382,11 @@ def perform_ldsc_regression(ld_scores,
             'Coefficients': list(zip(ld_score_names, reg.coef))
         }
 
+        ld_weighs = np.sqrt(np.maximum(nss_df[ldc['WeightCol']].values, 1.))
+        ld_weighs /= float(np.sum(ld_weighs))
+
         ldc['Regression']['LRT'] = get_model_lrt(reg.coef, reg.intercept,
-                                                 nss_df, ld_score_names, ldc['WeightCol'])
+                                                 nss_df, ld_score_names, ld_weighs)
         ldc['Regression']['LRT_se'] = 0.0
 
         pred_chi2 = predict_chi2(reg.coef, reg.intercept,
@@ -392,7 +395,7 @@ def perform_ldsc_regression(ld_scores,
         ldc['Regression']['Predictive Performance'] = {
             'Overall': compute_prediction_metrics(pred_chi2,
                                                   nss_df['CHISQ'].values,
-                                                  nss_df[ldc['WeightCol']].values),
+                                                  1./ld_weighs),
             'Per MAF bin': {}
         }
 
@@ -401,7 +404,7 @@ def perform_ldsc_regression(ld_scores,
             ldc['Regression']['Predictive Performance']['Per MAF bin'][i] = compute_prediction_metrics(
                 pred_chi2[maf_subset],
                 nss_df.loc[maf_subset, 'CHISQ'].values,
-                nss_df.loc[maf_subset, ldc['WeightCol']].values
+                1./ld_weighs[maf_subset]
             )
 
         if ldc['Annotation']:
@@ -473,7 +476,7 @@ def perform_ldsc_regression(ld_scores,
                 ldc['Regression']['Annotations']['Predictive Performance'][an] = {
                     'Overall': compute_prediction_metrics(pred_chi2[ann_subset],
                                                           nss_df.loc[ann_subset, 'CHISQ'].values,
-                                                          nss_df.loc[ann_subset, ldc['WeightCol']].values),
+                                                          1. / ld_weighs[ann_subset]),
                     'Per MAF bin': {}
                 }
 
@@ -482,7 +485,7 @@ def perform_ldsc_regression(ld_scores,
                     ldc['Regression']['Annotations']['Predictive Performance'][an]['Per MAF bin'][i] = compute_prediction_metrics(
                         pred_chi2[ann_subset & maf_subset],
                         nss_df.loc[ann_subset & maf_subset, 'CHISQ'].values,
-                        nss_df.loc[ann_subset & maf_subset, ldc['WeightCol']].values
+                        1. / ld_weighs[ann_subset & maf_subset]
                     )
 
         write_pbz2(os.path.join(output_dir, f"{ldc['Name']}.pbz2"),
diff --git a/predictive_performance_analysis.py b/predictive_performance_analysis.py
@@ -31,7 +31,12 @@
         'S-R2_1.0': '#F28E2B'
     }
 
-methods = ['S-D2_0.0', 'S-D2_0.25', 'S-D2_0.5', 'S-D2_0.75', 'S-D2_1.0']
+partitioned = False
+methods = ['R2_0.0', 'R2_0.25', 'R2_0.5', 'R2_0.75', 'R2_1.0']
+
+if partitioned:
+    methods = ['S-' + m for m in methods]
+
 metrics = [
     'Mean Difference',
     'Weighted Mean Difference',
@@ -44,7 +49,7 @@
 
 annot_res = []
 global_res = []
-avg_chi2 = []
+all_snps_chi2 = []
 
 for trait_file in glob.glob("results/regression/EUR/M_5_50_chi2filt/*/*.pbz2"):
     trait_res = read_pbz2(trait_file)
@@ -63,31 +68,34 @@
                         'Method': m
                     })
 
-            avg_chi2.append({
-                'Trait': trait_name,
-                'Score': trait_res['Predictive Performance']['Overall']['Mean Predicted Chisq'],
-                'Method': m
-            })
-
-            for ann, ann_res in trait_res['Annotations']['Predictive Performance'].items():
-                for mbin, mbin_res in ann_res['Per MAF bin'].items():
-                    for metric in metrics:
-
-                        annot_res.append({
-                            'Annotation': ann,
-                            'Trait': trait_name,
-                            'MAFbin': mbin,
-                            'Metric': metric,
-                            'Score': mbin_res[metric],
-                            'Method': m
-                        })
+            for metric in metrics + ['Mean Predicted Chisq']:
+                all_snps_chi2.append({
+                    'Trait': trait_name,
+                    'Metric': metric,
+                    'Score': trait_res['Predictive Performance']['Overall'][metric],
+                    'Method': m
+                })
+
+            if partitioned:
+                for ann, ann_res in trait_res['Annotations']['Predictive Performance'].items():
+                    for mbin, mbin_res in ann_res['Per MAF bin'].items():
+                        for metric in metrics:
+
+                            annot_res.append({
+                                'Annotation': ann,
+                                'Trait': trait_name,
+                                'MAFbin': mbin,
+                                'Metric': metric,
+                                'Score': mbin_res[metric],
+                                'Method': m
+                            })
 
 annot_res = pd.DataFrame(annot_res)
 global_res = pd.DataFrame(global_res)
-avg_chi2 = pd.DataFrame(avg_chi2)
+all_snps_chi2 = pd.DataFrame(all_snps_chi2)
 
 print(f'Average {metric} across all traits and SNP categories:')
-print(avg_chi2.groupby('Method').mean())
+print(all_snps_chi2.groupby(['Method', 'Metric']).mean())
 
 print('= = = = = = =')
 
@@ -107,36 +115,37 @@
     plt.savefig(f"figures/analysis/global/{metric}{fig_format}")
     plt.close()
 
-    plt.subplots(figsize=(10, 8))
-    sns.barplot(x='MAFbin', y='Score', hue='Method',
-                data=annot_res.loc[annot_res['Metric'] == metric], ci=None,
-                hue_order=methods,
-                palette=ld_scores_colors)
-    plt.xlabel('MAF Decile bin')
-    plt.ylabel(metric)
-    plt.savefig(f"figures/analysis/annotation/{metric}{fig_format}")
-    plt.close()
-
-    highly_enriched_cats = [
-        'Coding_UCSC',
-        'Conserved_LindbladToh',
-        'GERP.RSsup4',
-        'synonymous',
-        'Conserved_Vertebrate_phastCons46way',
-        'Conserved_Mammal_phastCons46way',
-        'Conserved_Primate_phastCons46way',
-        'BivFlnk',
-        'Ancient_Sequence_Age_Human_Promoter',
-        'Human_Promoter_Villar_ExAC'
-    ]
-
-    plt.subplots(figsize=(10, 8))
-    sns.barplot(x='MAFbin', y='Score', hue='Method',
-                hue_order=methods,
-                data=annot_res.loc[annot_res['Annotation'].isin(highly_enriched_cats) &
-                                   (annot_res['Metric'] == metric)], ci=None,
-                palette=ld_scores_colors)
-    plt.xlabel('MAF Decile bin')
-    plt.ylabel(metric)
-    plt.savefig(f"figures/analysis/highly_enriched_annotation/{metric}{fig_format}")
-    plt.close()
+    if partitioned:
+        plt.subplots(figsize=(10, 8))
+        sns.barplot(x='MAFbin', y='Score', hue='Method',
+                    data=annot_res.loc[annot_res['Metric'] == metric], ci=None,
+                    hue_order=methods,
+                    palette=ld_scores_colors)
+        plt.xlabel('MAF Decile bin')
+        plt.ylabel(metric)
+        plt.savefig(f"figures/analysis/annotation/{metric}{fig_format}")
+        plt.close()
+
+        highly_enriched_cats = [
+            'Coding_UCSC',
+            'Conserved_LindbladToh',
+            'GERP.RSsup4',
+            'synonymous',
+            'Conserved_Vertebrate_phastCons46way',
+            'Conserved_Mammal_phastCons46way',
+            'Conserved_Primate_phastCons46way',
+            'BivFlnk',
+            'Ancient_Sequence_Age_Human_Promoter',
+            'Human_Promoter_Villar_ExAC'
+        ]
+
+        plt.subplots(figsize=(10, 8))
+        sns.barplot(x='MAFbin', y='Score', hue='Method',
+                    hue_order=methods,
+                    data=annot_res.loc[annot_res['Annotation'].isin(highly_enriched_cats) &
+                                       (annot_res['Metric'] == metric)], ci=None,
+                    palette=ld_scores_colors)
+        plt.xlabel('MAF Decile bin')
+        plt.ylabel(metric)
+        plt.savefig(f"figures/analysis/highly_enriched_annotation/{metric}{fig_format}")
+        plt.close()