MR-link-2 for R in working condition. Not as featureful as the python…

… version, but will produce the same results
adriaan-vd-graaf · Oct 7, 2024 · ae3eb0b · ae3eb0b
1 parent 73fb48d
commit ae3eb0b
Show file tree

Hide file tree

Showing 3 changed files with 164 additions and 8 deletions.
diff --git a/R/mr_link_2_functions.R b/R/mr_link_2_functions.R
@@ -81,7 +81,7 @@ mr_link2 <- function(selected_eigenvalues, selected_eigenvectors,
 
   # Define optimization options
   method <- "Nelder-Mead"
-  control <- list(maxit = 100000)
+  control <- list(maxit = 300)
 
   # Calculate c_x and c_y
   c_x <- t(selected_eigenvectors) %*% exposure_betas
@@ -207,4 +207,83 @@ mr_link2 <- function(selected_eigenvalues, selected_eigenvectors,
     optim_ha_nit = ha_results$counts,
     function_time = as.numeric(difftime(Sys.time(), start_time, units = "secs"))
   )
+}
+
+remove_highly_correlated <- function(ld_matrix, snp_ordering, max_correlation) {
+  # Remove NAs from the correlation matrix
+  ld_matrix[is.na(ld_matrix)] <- 0
+
+  # Initialize a logical vector to mark rows/columns to remove
+  idxs_to_remove <- rep(FALSE, nrow(ld_matrix))
+
+  # Find pairs of SNPs that are highly correlated
+  correlated_indices <- which(abs(ld_matrix) >= max_correlation, arr.ind = TRUE)
+
+  to_remove <- vector("logical", length = nrow(ld_matrix))
+
+  # Loop through correlated pairs and mark SNPs to remove
+  for (i in seq_len(nrow(correlated_indices))) {
+    a <- correlated_indices[i, 1]
+    b <- correlated_indices[i, 2]
+
+    if ((!to_remove[a] && !to_remove[b] ) && (a != b)) {
+      to_remove[b] <- TRUE
+    }
+  }
+
+  # Remove highly correlated SNPs from both the matrix and the SNP list
+  ld_matrix <- ld_matrix[!to_remove, !to_remove]
+  pruned_snps <- snp_ordering[!to_remove]
+
+  return(list(ld_matrix = ld_matrix, pruned_snps = pruned_snps))
+}
+
+mr_link2_analysis <- function(exposure_betas, outcome_betas, ld_matrix, n_exp, n_out, max_correlation = 0.99) {
+  # Prune highly correlated SNPs
+  snp_ordering <- seq_len(nrow(ld_matrix))  # Simple numbering for SNPs
+  pruned_data <- remove_highly_correlated(ld_matrix, snp_ordering, max_correlation)
+  pruned_ld_matrix <- pruned_data$ld_matrix
+  pruned_snps <- pruned_data$pruned_snps
+
+  # Eigenvalue decomposition
+  eigen_decomp <- eigen(pruned_ld_matrix)
+  eigenvalues <- eigen_decomp$values
+  eigenvectors <- eigen_decomp$vectors
+
+  # Calculate cumulative variance explained
+  variance_explained <- cumsum(eigenvalues) / sum(eigenvalues)
+
+  # Select the eigenvectors that explain at least 99% of the variance
+  threshold_index <- min(which(variance_explained >= 0.99))
+  selected_eigenvectors <- eigenvectors[, 1:threshold_index]
+  selected_eigenvalues <- eigenvalues[1:threshold_index]
+
+  # Call the existing MR-link-2 function
+  # Assuming `mr_link2` is already defined in R and takes the following arguments:
+  # - selected_eigenvalues
+  # - selected_eigenvectors
+  # - exposure_betas
+  # - outcome_betas
+  # - n_exp (number of individuals in exposure dataset)
+  # - n_out (number of individuals in outcome dataset)
+  sigma_exp_guess <- 0.01  # You can adjust or estimate this as needed
+  sigma_out_guess <- 0.001  # You can adjust or estimate this as needed
+
+  # Subset the betas based on pruned SNPs
+  exposure_betas_pruned <- exposure_betas[pruned_snps]
+  outcome_betas_pruned <- outcome_betas[pruned_snps]
+
+  # Perform the MR-link-2 estimation
+  mr_result <- mr_link2(
+    selected_eigenvalues = selected_eigenvalues,
+    selected_eigenvectors = selected_eigenvectors,
+    exposure_betas = exposure_betas_pruned,
+    outcome_betas = outcome_betas_pruned,
+    n_exp = n_exp,
+    n_out = n_out,
+    sigma_exp_guess = sigma_exp_guess,
+    sigma_out_guess = sigma_out_guess
+  )
+
+  return(mr_result)
 }
diff --git a/tests/integration_tests_mr_link_2.py b/tests/integration_tests_mr_link_2.py
@@ -13,11 +13,6 @@
 from mr_link_2_standalone import *
 
 
-""""
-NB. These tests require us to have plink in your path. 
-So unfortunately I will not test them in the github workflow.
-"""
-
 def test_plink_version():
     result = subprocess.run(['plink', '--version'], capture_output=True, text=True)
     assert result.returncode == 0, "PLINK is not installed or not in the PATH"

diff --git a/tests/test_for_r_and_python_concordance.py b/tests/test_for_r_and_python_concordance.py
@@ -11,7 +11,8 @@
 ## for the R part
 
 from rpy2.robjects import r, numpy2ri, default_converter
-from rpy2.robjects.vectors import FloatVector
+from rpy2.robjects.vectors import FloatVector, ListVector, StrVector
+
 from rpy2.robjects.packages import importr
 from rpy2.robjects.conversion import localconverter
 
@@ -22,7 +23,7 @@
 # Load the base R package
 base = importr('base')
 # Source the R file with the functions
-r['source']('../R/mr_link_2_functions.R')
+r['source'](os.path.abspath(os.path.join(os.path.dirname(__file__), '..',  'R', 'mr_link_2_functions.R')))
 
 """
 Now, we perform unit tests on the MR-link-2 functions. to ensure that there is no creep of the 
@@ -423,3 +424,84 @@ def test_mr_link2_equivalence_fuzz(selected_eigenvalues, selected_eigenvectors,
     for key in low_precision_keys:
         assert np.isclose(python_result[key], r_result_dict[key], rtol=2.5e-2, ## this is pretty suboptimal, but as this also includes P values, there will not be a crazy amount of difference here.
                           atol=1e-8), f"Mismatch for {key}: Python={python_result[key]}, R={r_result_dict[key]}"
+
+
+def test_remove_highly_correlated():
+    """Test that the highly correlated SNPs are correctly removed."""
+
+    # Example LD matrix with correlation values
+    ld_matrix = np.array([[1.0, 0.9, 0.1],
+                          [0.9, 1.0, 0.05],
+                          [0.1, 0.05, 1.0]])
+
+    snp_ordering = ["SNP1", "SNP2", "SNP3"]
+    max_correlation = 0.8  # Set the maximum correlation threshold
+
+    with localconverter(numpy2ri.converter):
+        r_ld_matrix = r.matrix(ld_matrix, nrow=3)
+        # Wrap the snp_ordering list into a dictionary format for ListVector
+        r_snp_ordering = StrVector(snp_ordering)
+
+        # Call the R function `remove_highly_correlated`
+        r_pruned_result = r['remove_highly_correlated'](r_ld_matrix, r_snp_ordering, max_correlation)
+
+    # Extract results from R output
+    pruned_ld_matrix = np.array(r_pruned_result['ld_matrix'])
+    pruned_snps = list(r_pruned_result['pruned_snps'])
+
+    # Expected pruned results (SNP2 is removed)
+    expected_pruned_ld_matrix = np.array([[1.0, 0.05], [0.05, 1.0]])
+    expected_pruned_snps = ["SNP2", "SNP3"]
+
+    # Assertions
+    assert np.allclose(pruned_ld_matrix, expected_pruned_ld_matrix), "LD matrix was not pruned correctly."
+    assert pruned_snps == expected_pruned_snps, f"SNP pruning failed. Expected {expected_pruned_snps}, got {pruned_snps}"
+
+# Test function for eigenvalue decomposition and MR-link-2 analysis
+def test_mr_link2_analysis():
+    """Test the MR-link-2 analysis on synthetic data."""
+
+    # Synthetic data
+    np.random.seed(42)
+    exposure_betas = np.random.normal(size=100)
+    outcome_betas = np.random.normal(size=100)
+    ld_matrix = np.random.normal(0, 1, (100, 100))
+    ld_matrix = (ld_matrix + ld_matrix.T) / 2  # Make it symmetric
+    np.fill_diagonal(ld_matrix, 1)  # Ensure diagonals are 1s (perfect correlation with self)
+
+    n_exp = 1000
+    n_out = 1000
+    max_correlation = 0.9
+
+    # Convert inputs to R-compatible types
+    with localconverter(numpy2ri.converter):
+        r_exposure_betas = FloatVector(exposure_betas)
+        r_outcome_betas = FloatVector(outcome_betas)
+        r_ld_matrix = r.matrix(ld_matrix, nrow=100)
+        r_n_exp = n_exp
+        r_n_out = n_out
+        r_max_correlation = max_correlation
+
+        # Call the R function `mr_link2_analysis`
+        r_mr_result = r['mr_link2_analysis'](r_exposure_betas, r_outcome_betas, r_ld_matrix, r_n_exp, r_n_out,
+                                             r_max_correlation)
+
+    # Convert the result back to a Python-friendly format
+    mr_result_dict = dict(r_mr_result)
+
+    # Check the output has expected keys and ranges
+    assert 'alpha' in mr_result_dict, "Key 'alpha' missing in MR result."
+    assert 'p(alpha)' in mr_result_dict, "Key 'p(alpha)' missing in MR result."
+    assert 'sigma_x' in mr_result_dict, "Key 'sigma_x' missing in MR result."
+
+    # Example value checks (can be replaced with actual expected ranges based on real data)
+    assert np.isfinite(mr_result_dict['alpha']), "Alpha value is not finite."
+    assert 0 <= mr_result_dict['p(alpha)'] <= 1, "P-value for alpha is out of bounds."
+    assert mr_result_dict['sigma_x'] > 0, "Sigma_x should be positive."
+
+    print("MR-link-2 result passed all checks.")
+
+
+# Run the tests
+if __name__ == '__main__':
+    pytest.main()