diff --git a/scr/Makefile b/scr/Makefile index b9e696a..750ae7a 100644 --- a/scr/Makefile +++ b/scr/Makefile @@ -11,8 +11,8 @@ ARMALIB = /net/mulan/home/yasheng/Cpp/arma/lib # Put C++ complier CXX = g++ #CXX = clang++-11 -CPP_FILES = main_dbslmm.cpp dtpr.cpp dbslmm.cpp dbslmmfit.cpp calc_asymptotic_variance.cpp subset_to_test_and_training.cpp -HPP_FILES = dtpr.hpp dbslmm.hpp dbslmmfit.hpp calc_asymptotic_variance.hpp subset_to_test_and_training.hpp +CPP_FILES = main_dbslmm.cpp dtpr.cpp dbslmm.cpp dbslmmfit.cpp calc_asymptotic_variance.cpp subset_to_test_and_training.cpp helpers.cpp +HPP_FILES = dtpr.hpp dbslmm.hpp dbslmmfit.hpp calc_asymptotic_variance.hpp subset_to_test_and_training.hpp helpers.hpp diff --git a/scr/dbslmm.cpp b/scr/dbslmm.cpp index a05c8c0..dad8a40 100644 --- a/scr/dbslmm.cpp +++ b/scr/dbslmm.cpp @@ -265,13 +265,16 @@ void DBSLMM::BatchRun(PARAM &cPar) { std::vector base_nums = readTestBim(cPar.dat_str + ".bim"); // vector test_inter_s = makePosObjectForTestBim(base_nums, inter_s); - + cout << "test_inter_s[1].snp: " << test_inter_s[1].snp << endl; + cout << "test_inter_s[1].pos: " << test_inter_s[1].pos << endl; + cout << "test_inter_s[1].ps: " << test_inter_s[1].ps << endl; vector info_s; vector test_info_s; int num_block_s = cSP.addBlock(inter_s, block_dat, info_s); //addBlock is defined in scr/dtpr.cpp & populates info_s int test_num_block_s = cSP.addBlock(test_inter_s, block_dat, test_info_s); //addBlock is defined in scr/dtpr.cpp & populates info_s - + cout << "test_num_block_s: " << test_num_block_s << endl; + cout << "test_info_s[1].pos: " << test_info_s[1].pos << endl; // output samll effect badsnps string badsnps_str = cPar.eff + ".badsnps"; @@ -341,7 +344,7 @@ void DBSLMM::BatchRun(PARAM &cPar) { eff_s, eff_l, test_indicator, - cPar.dat_str, + cPar.dat_str, //no file ending, like ".bed" or ".bim" test_info_s, test_info_l); double time_fitting = cIO.getWalltime() - t_fitting; diff --git a/scr/dbslmmfit.cpp b/scr/dbslmmfit.cpp index 26ee0b4..71fb4f9 100644 --- a/scr/dbslmmfit.cpp +++ b/scr/dbslmmfit.cpp @@ -29,6 +29,7 @@ along with this program. If not, see . #include "dbslmmfit.hpp" #include "calc_asymptotic_variance.hpp" #include "subset_to_test_and_training.hpp" +#include "helpers.hpp" using namespace std; using namespace arma; @@ -48,6 +49,8 @@ using namespace arma; //' @param eff_l large effects SNP effects object //' @param test_indicator vector containing ones and zeroes for indicating which subjects are in the test set. //' @param genotypes_str a string containing the file path to the file containing genotypes for test and training subjects +//' @param test_info_s info object for small effect SNPs from the test set +//' @param test_info_l info object for the large effect SNPs from test set //' @return zero is returned // estimate large and small effect int DBSLMMFIT::est(int n_ref, @@ -68,163 +71,132 @@ int DBSLMMFIT::est(int n_ref, ){ // get the maximum number of each block - int count_s = 0, count_l = 0; - vec num_s = zeros(num_block), num_l = zeros(num_block); - for (int i = 0; i < num_block; i++) { - for (size_t j = count_s; j < info_s.size(); j++) { - if(info_s[j].block == i){ - num_s(i) += 1; - count_s++; - }else{ - break; - } - } - for (size_t j = count_l; j < info_l.size(); j++) { - if(info_l[j].block == i){ - num_l(i) += 1; - count_l++; - }else{ - break; - } - } - }// end of counting and populating num_l and num_s, ie, iterates over i - count_l = count_s = 0; // reset + vec num_s = count_snps_per_block(num_block, info_s); + vec num_l = count_snps_per_block(num_block, info_l); + vec test_num_s = count_snps_per_block(num_block, test_info_s); + vec test_num_l = count_snps_per_block(num_block, test_info_l); + + double len_l = num_l.max(); //len_l is the maximum, across blocks, of the per-block number of large effect SNPs double len_s = num_s.max(); //len_s is the max of the per-block number of small effect SNPs + double test_len_l = test_num_l.max(); //len_l is the maximum, across blocks, of the per-block number of large effect SNPs + double test_len_s = test_num_s.max(); //len_s is the max of the per-block number of small effect SNPs int B = 0; int B_MAX = 60; // number of blocks to batch at one time - if (num_block < 60){ //num_block is the number of blocks across the genome. + if (num_block < 60){ //num_block is the number of blocks across the chromosome. B_MAX = num_block; } // pseudo INFO - INFO info_pseudo; - info_pseudo.snp = "rs"; - info_pseudo.ps = 0; - info_pseudo.pos = 0; - info_pseudo.block = 0; - info_pseudo.a1 = "Y"; - info_pseudo.maf = 0; - info_pseudo.z = 0; - info_pseudo.P = 0; - + INFO info_pseudo = make_pseudo_info(); + // loop // vector < vector > info_s_Block(B_MAX, vector ((int)len_s)), info_l_Block(B_MAX, vector ((int)len_l)); vector < vector > info_s_Block(B_MAX, vector ((int)len_s)), info_l_Block(B_MAX, vector ((int)len_l)), test_info_s_Block(B_MAX, vector ((int)len_s)), test_info_l_Block(B_MAX, vector ((int)len_l)); vector < vector > eff_s_Block(B_MAX, vector ((int)len_s)), eff_l_Block(B_MAX, vector ((int)len_l)); vector num_s_vec, num_l_vec; + vector test_num_s_vec, test_num_l_vec; + + unsigned int n_test = sum_vec(test_indicator); + int count_s = 0; + int count_l = 0; + arma::mat diags = zeros(n_test, num_block); //specify dims of diags - unsigned int n_test = std::accumulate(test_indicator.begin(), test_indicator.end(), - decltype(test_indicator)::value_type(0)); - arma::vec diags = zeros(n_test); //specify length of diags - for (int i = 0; i < num_block; ++i) { - // small effect SNP information - vector info_s_block; - for (size_t j = count_s; j < info_s.size(); j++) { - if(info_s[j].block == i){ - info_s_block.push_back(info_s[j]); - count_s++; - }else{ - break; - } - } //end loop for populating info_s_block - count_s = 0; - int test_count_s = 0; - int test_count_l = 0; - - vector test_info_s_block; - for (size_t j = count_s; j < test_info_s.size(); j++) { - if(test_info_s[j].block == i){ - test_info_s_block.push_back(test_info_s[j]); - test_count_s++; - }else{ - break; - } - } //end loop for populating test_info_s_block - for (size_t k = 0; k < info_s_block.size(); k++) - // info_s_Block[B][k] = &info_s_block[k]; - info_s_Block[B][k] = info_s_block[k]; - for (size_t k = 0; k < test_info_s_block.size(); k++) - // info_s_Block[B][k] = &info_s_block[k]; - test_info_s_Block[B][k] = test_info_s_block[k]; - - num_s_vec.push_back((int)num_s(i)); - // large effect SNP information - if (num_l(i) == 0){ - // info_l_Block[B][0] = &info_pseudo; - info_l_Block[B][0] = info_pseudo; - test_info_l_Block[B][0] = info_pseudo; - }else{ - vector info_l_block; - for (size_t j = count_l; j < info_l.size(); j++) { - if(info_l[j].block == i){ - info_l_block.push_back(info_l[j]); - count_l++; - }else{ - break; - } - } - for (size_t k = 0; k < info_l_block.size(); k++) - info_l_Block[B][k] = info_l_block[k]; - // test_info_l_block - vector test_info_l_block; - for (size_t j = test_count_l; j < test_info_l.size(); j++) { - if(test_info_l[j].block == i){ - test_info_l_block.push_back(test_info_l[j]); - test_count_l++; - }else{ - break; - } - } - for (size_t k = 0; k < test_info_l_block.size(); k++) - info_l_Block[B][k] = test_info_l_block[k]; - } - num_l_vec.push_back((int)num_l(i)); - B++; - if (B == B_MAX || i + 1 == num_block) { // process the block of SNPs using multi-threading - omp_set_num_threads(thread); + + for (int i = 0; i < num_block; ++i) { + // small effect SNP information + vector info_s_block; + for (size_t j = count_s; j < info_s.size(); j++) { + if(info_s[j].block == i){ + info_s_block.push_back(info_s[j]); + count_s++; + }else{ + break; + } + } + for (size_t k = 0; k < info_s_block.size(); k++) + // info_s_Block[B][k] = &info_s_block[k]; + + + + info_s_Block[B][k] = info_s_block[k]; +num_s_vec.push_back((int)num_s(i)); + + // large effect SNP information + if (num_l(i) == 0){ + // info_l_Block[B][0] = &info_pseudo; + info_l_Block[B][0] = info_pseudo; + }else{ + vector info_l_block; + for (size_t j = count_l; j < info_l.size(); j++) { + if(info_l[j].block == i){ + info_l_block.push_back(info_l[j]); + count_l++; + }else{ + break; + } + } + for (size_t k = 0; k < info_l_block.size(); k++) + // info_l_Block[B][k] = &info_l_block[k]; + info_l_Block[B][k] = info_l_block[k]; + } + num_l_vec.push_back((int)num_l(i)); + + B++; + if (B == B_MAX || i + 1 == num_block) { // process the block of SNPs using multi-threading + + omp_set_num_threads(thread); #pragma omp parallel for schedule(dynamic) - for (int b = 0; b < B; b++){ - cout << "call calcBlock... b has value:"<< b << endl; - diags += calcBlock(n_ref, - n_obs, - sigma_s, - idv, - bed_str, - info_s_Block[b], + for (int b = 0; b < B; b++){ + cout << "call calcBlock... b has value:"<< b << endl; + arma::vec cb_out = calcBlock(n_ref, + n_obs, + sigma_s, + idv, + bed_str, + info_s_Block[b], info_l_Block[b], - num_s_vec[b], - num_l_vec[b], - eff_s_Block[b], - eff_l_Block[b], + num_s_vec[b], + num_l_vec[b], + eff_s_Block[b], + eff_l_Block[b], test_indicator, - genotypes_str, - test_info_s_Block[b], + genotypes_str, + test_info_s_Block[b], test_info_l_Block[b]); - // int index = floor(i / B_MAX) * B_MAX + b; + cout << "size of cb_out: " << cb_out.size() <(num_block); - for (int i = 0; i < num_block; i++) { - for (size_t j = count_s; j < info_s.size(); j++) { - if(info_s[j].block == i){ - num_s(i) += 1; - count_s++; - }else{ - break; - } - } - } - count_s = 0; // reset - int test_count_s = 0; - for (int i = 0; i < num_block; i++) { - for (size_t j = test_count_s; j < test_info_s.size(); j++) { - if(test_info_s[j].block == i){ - num_s(i) += 1; - test_count_s++; - }else{ - break; - } - } - } - test_count_s = 0; // reset - + arma::vec num_s = count_snps_per_block(num_block, info_s); + cout << "num_block: " << num_block < > info_s_Block(B_MAX, vector ((int)len_s)); vector < vector > info_s_Block(B_MAX, vector ((int)len_s)); - vector < vector > test_info_s_Block(B_MAX, vector ((int)len_s)); + vector < vector > test_info_s_Block(B_MAX, vector ((int)test_len_s)); vector < vector > eff_s_Block(B_MAX, vector ((int)len_s)); vector num_s_vec; - unsigned int n_test = std::accumulate(test_indicator.begin(), test_indicator.end(), - decltype(test_indicator)::value_type(0)); + vector test_num_s_vec; + // + unsigned int n_test = sum_vec(test_indicator);//test_indicator contains only ones and zeros cout << "n_test: " << n_test << endl; - arma::vec diags = zeros(n_test); //specify length of diags & set all to zeros - - for (int i = 0; i < num_block; ++i) { - // small effect SNP information - vector info_s_block; - for (size_t j = count_s; j < info_s.size(); j++) { - if(info_s[j].block == i){ - info_s_block.push_back(info_s[j]); - count_s++; - }else{ - break; - } - } - for (size_t k = 0; k < info_s_block.size(); k++) - // info_s_Block[B][k] = &info_s_block[k]; - info_s_Block[B][k] = info_s_block[k]; - vector test_info_s_block; - test_count_s = 0; - for (size_t j = test_count_s; j < test_info_s.size(); j++) { - if(test_info_s[j].block == i){ - test_info_s_block.push_back(test_info_s[j]); - test_count_s++; - }else{ - break; - } - } - for (size_t k = 0; k < info_s_block.size(); k++) - // info_s_Block[B][k] = &info_s_block[k]; - info_s_Block[B][k] = info_s_block[k]; - num_s_vec.push_back((int)num_s(i)); - - B++; - if (B == B_MAX || i + 1 == num_block) { // process the block of SNPs using multi-threading - omp_set_num_threads(thread); + arma::mat diags = zeros(n_test, num_block); //specify length of diags & set all entries to zeros + cout << "dimensions of diags: " << diags.n_rows << " rows and cols: " << diags.n_cols << endl; + int count_s = 0; + for (int i = 0; i < num_block; ++i) { + // small effect SNP information + vector info_s_block; + for (size_t j = count_s; j < info_s.size(); j++) { + if(info_s[j].block == i){ + info_s_block.push_back(info_s[j]); + count_s++; + }else{ + break; + } + } + for (size_t k = 0; k < info_s_block.size(); k++) + // info_s_Block[B][k] = &info_s_block[k]; + info_s_Block[B][k] = info_s_block[k]; + num_s_vec.push_back((int)num_s(i)); + + B++; + if (B == B_MAX || i + 1 == num_block) { // process the block of SNPs using multi-threading + + omp_set_num_threads(thread); #pragma omp parallel for schedule(dynamic) - for (int b = 0; b < B; b++){ - diags += calcBlock(n_ref, - n_obs, - sigma_s, - idv, - bed_str, + for (int b = 0; b < B; b++){ + arma::vec cb_out = calcBlock(n_ref, + n_obs, + sigma_s, + idv, + bed_str, info_s_Block[b], - num_s_vec[b], - eff_s_Block[b], - test_indicator, - genotypes_str, + num_s_vec[b], + eff_s_Block[b], + test_indicator, + genotypes_str, test_info_s_Block[b]); - //cout <<"index: " << index << endl; - } - // eff of small effect SNPs - for (int r = 0; r < B; r++) { - for (int l = 0; l < num_s_vec[r]; l++){ - eff_s.push_back(eff_s_Block[r][l]); - } + cout << "size of cb_out: " << cb_out.size() < test_info_s_block_full, vector test_info_l_block_full ){ - cout << "starting line 1 of calcBlock..."<< endl; SNPPROC cSP; IO cIO; ifstream bed_in(bed_str.c_str(), ios::binary); //open stream for observed data (not the reference panel) - ifstream dat_in(genotypes_str.c_str(), ios::binary); + string test_bed = genotypes_str + ".bed"; + ifstream dat_in(test_bed.c_str(), ios::binary); // INFO small effect SNPs // vector info_s_block(num_s_block); @@ -560,7 +500,8 @@ arma::vec DBSLMMFIT::calcBlock(int n_ref, IO cIO; ifstream bed_in(bed_str.c_str(), ios::binary); //open stream for observed data (not the reference panel) - ifstream dat_in(genotypes_str.c_str(), ios::binary); + string test_bed = genotypes_str + ".bed"; + ifstream dat_in(test_bed.c_str(), ios::binary); // INFO small effect SNPs // vector info_s_block(num_s_block); // for (int i = 0; i < num_s_block; i++) diff --git a/scr/helpers.cpp b/scr/helpers.cpp new file mode 100644 index 0000000..3413827 --- /dev/null +++ b/scr/helpers.cpp @@ -0,0 +1,92 @@ +#include + +#include "helpers.hpp" +#include "dtpr.hpp" + +using namespace std; +using namespace arma; + + +//' Count the number of SNPs in each block on one chromosome +//' +//' @param num_block number of blocks on the chromosome +//' @param info a vector for small or large effect SNPs on that chromosome +//' @return an armadillo vector with one entry per block; that entry is the number of SNPs in the block + +arma::vec count_snps_per_block(int num_block, vector info){ + int count = 0; + arma::vec num = zeros(num_block); + for (int i = 0; i < num_block; i++) { + for (size_t j = count; j < info.size(); j++) { + if(info[j].block == i){ + num(i) += 1; + count++; + }else{ + break; + } + } + } + return num; +} + +//' Sum entries in a vector with only nonnegative integer entries +//' +//' @param vv vector to be summed +//' @return unsigned int that is the sum of the entries in the vector + +unsigned int sum_vec(vector vv){ + unsigned int result = std::accumulate(vv.begin(), vv.end(), + decltype(vv)::value_type(0)); + return result; +} + +//' Subset info object for a whole chromosome to info object for a single block only +//' +//' @param info a vector object +//' @param block_num integer indicating which block it is +//' @return a vector object for exactly one block + +vector subset_info_by_block(vector info, int block_num){ + vector result; + int count_s = 0; + for (size_t j = count_s; j < info.size(); j++) { + if(info[j].block == block_num){ + result.push_back(info[j]); + count_s++; + }else{ + break; + } + } + return result; +} + + +//' Populate vector of vector of INFO +//' +//' @param info a vector object +//' @return +//' +vector populate_vector_vector_info(vector info, int max_length){ + vector result(max_length); + for (size_t k = 0; k < info.size(); k++) + // info_s_Block[B][k] = &info_s_block[k]; + result[k] = info[k]; + return result; +} + +//' Prepare pseudo INFO object +//' +//' @return a pseudo INFO object + +INFO make_pseudo_info(){ + INFO info_pseudo; + info_pseudo.snp = "rs"; + info_pseudo.ps = 0; + info_pseudo.pos = 0; + info_pseudo.block = 0; + info_pseudo.a1 = "Y"; + info_pseudo.maf = 0; + info_pseudo.z = 0; + info_pseudo.P = 0; + return info_pseudo; +} diff --git a/scr/helpers.hpp b/scr/helpers.hpp new file mode 100644 index 0000000..5af4c9f --- /dev/null +++ b/scr/helpers.hpp @@ -0,0 +1,18 @@ +#include + + +#include "dtpr.hpp" + +using namespace std; +using namespace arma; + +arma::vec count_snps_per_block(int num_block, vector info); + +unsigned int sum_vec(vector vv); + +vector subset_info_by_block(vector info, int block_num); + +vector populate_vector_vector_info(vector info, int max_length); + +INFO make_pseudo_info(); +