Skip to content

Commit 18b3e25

Browse files
committed
tidyup FileBgen
1 parent e47db8d commit 18b3e25

File tree

5 files changed

+28
-74
lines changed

5 files changed

+28
-74
lines changed

.github/workflows/linux.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,10 @@ jobs:
2323
- name: make
2424
run: make -j4
2525

26-
- name: download example data
27-
run: make data
26+
- name: download and test example data
27+
run: |
28+
make data
29+
make example_tests
2830
2931
- name: test LD module
3032
run: |

.github/workflows/mac.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
- uses: actions/checkout@v3
1616

1717
- name: install dependency
18-
run: brew install libomp
18+
run: brew install libomp wget
1919

2020
- name: make
2121
run: |
@@ -24,4 +24,6 @@ jobs:
2424
make -j4
2525
2626
- name: test
27-
run: ./PCAone --help
27+
run: |
28+
make data
29+
make example_tests

Makefile

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,11 @@ data:
158158
wget http://popgen.dk/zilong/datahub/pca/example.tar.gz
159159
tar -xzf example.tar.gz && rm -f example.tar.gz
160160

161+
example_tests:
162+
./PCAone -g example/test.bgen -n4 -o m0
163+
./PCAone -g example/test.bgen -n4 -m0.1 -o m1
164+
diff m0.eigvals m1.eigvals
165+
161166
hwe:
162167
./PCAone -b example/plink -k 3 -V
163168
./PCAone -b example/plink --USV pcaone --inbreed 1 -o inbreed_m0
@@ -213,6 +218,3 @@ ld_tests:
213218
./PCAone -B adj.residuals --match-bim adj.mbim --ld-r2 0.8 --ld-bp 1000000 -o adj_prune_m1 -m 2
214219
diff adj_prune_m0.ld.prune.out adj_prune_m1.ld.prune.out > /dev/null
215220

216-
bgen_tests:
217-
./PCAone -g example/test.bgen -n6
218-
./PCAone -g example/test.bgen -m 0.1 -n6

src/FileBgen.cpp

Lines changed: 7 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -129,13 +129,6 @@ void FileBgen::read_all() {
129129
}
130130

131131
void FileBgen::read_block_initial(uint64 start_idx, uint64 stop_idx, bool standardize) {
132-
read_bgen_block(G, F, bg, dosages.data(), frequency_was_estimated, nsamples, nsnps, blocksize, start_idx,
133-
stop_idx, standardize);
134-
}
135-
136-
void read_bgen_block(Mat2D &G, Mat1D &F, bgen::CppBgenReader *bg, float *dosages,
137-
bool &frequency_was_estimated, uint64 nsamples, uint64 nsnps, uint blocksize,
138-
uint64 start_idx, uint64 stop_idx, bool standardize) {
139132
uint actual_block_size = stop_idx - start_idx + 1;
140133
uint i, j, snp_idx;
141134
if (G.cols() < blocksize || (actual_block_size < blocksize)) {
@@ -145,7 +138,7 @@ void read_bgen_block(Mat2D &G, Mat1D &F, bgen::CppBgenReader *bg, float *dosages
145138
for (i = 0; i < actual_block_size; ++i) {
146139
snp_idx = start_idx + i;
147140
auto var = bg->next_var();
148-
var.minor_allele_dosage(dosages);
141+
var.minor_allele_dosage(dosages.data());
149142
#pragma omp parallel for
150143
for (j = 0; j < nsamples; j++) {
151144
if (std::isnan(dosages[j])) {
@@ -163,7 +156,7 @@ void read_bgen_block(Mat2D &G, Mat1D &F, bgen::CppBgenReader *bg, float *dosages
163156
for (i = 0; i < actual_block_size; ++i) {
164157
snp_idx = start_idx + i;
165158
auto var = bg->next_var();
166-
var.minor_allele_dosage(dosages);
159+
var.minor_allele_dosage(dosages.data());
167160
gc = 0;
168161
gs = 0.0;
169162
#pragma omp parallel for reduction(+ : gc) reduction(+ : gs)
@@ -194,53 +187,11 @@ void read_bgen_block(Mat2D &G, Mat1D &F, bgen::CppBgenReader *bg, float *dosages
194187
}
195188
}
196189

197-
// this would be fast
198-
int shuffle_bgen_to_bin(std::string &fin, std::string fout, uint gb, bool standardize) {
199-
cao.print(tick.date(), "begin to permute BGEN into BINARY file");
200-
bgen::CppBgenReader *bg = new bgen::CppBgenReader(fin, "", true);
201-
uint nsamples = bg->header.nsamples;
202-
uint nsnps = bg->header.nvariants;
203-
const uint ibyte = 4;
204-
uint64 bytes_per_snp = nsamples * ibyte;
205-
uint blocksize = 1073741824 * gb / bytes_per_snp;
206-
uint nblocks = (nsnps + blocksize - 1) / blocksize;
207-
std::ofstream ofs(fout + ".perm.bin", std::ios::binary);
208-
std::ofstream ofs2(fout + ".perm.txt");
209-
ofs.write((char *)&nsnps, ibyte);
210-
ofs.write((char *)&nsamples, ibyte);
211-
uint magic = ibyte * 2;
212-
std::vector<float> dosages(nsamples);
213-
214-
bool frequency_was_estimated = false;
215-
std::vector<uint> perm(nsnps);
216-
std::iota(perm.begin(), perm.end(), 0);
217-
auto rng = std::default_random_engine{};
218-
std::shuffle(perm.begin(), perm.end(), rng);
219-
Mat2D G;
220-
Mat1D F(nsnps);
221-
uint64 idx, cur = 0;
222-
for (uint i = 0; i < nblocks; i++) {
223-
auto start_idx = i * blocksize;
224-
auto stop_idx = start_idx + blocksize - 1;
225-
stop_idx = stop_idx >= nsnps ? nsnps - 1 : stop_idx;
226-
read_bgen_block(G, F, bg, dosages.data(), frequency_was_estimated, nsamples, nsnps, blocksize,
227-
start_idx, stop_idx, standardize);
228-
for (Eigen::Index p = 0; p < G.cols(); p++, cur++) {
229-
ofs2 << perm[cur] << "\n";
230-
idx = magic + perm[cur] * bytes_per_snp;
231-
ofs.seekp(idx, std::ios_base::beg);
232-
ofs.write((char *)G.col(p).data(), bytes_per_snp);
233-
}
234-
}
235-
delete bg;
236-
fin = fout + ".perm.bin";
237-
return (nsnps == cur);
238-
}
239-
240190
void permute_bgen_thread(std::vector<int> idx, std::string fin, std::string fout, int ithread) {
241191
fout = fout + ".perm." + to_string(ithread) + ".bgen";
242-
bgen::CppBgenReader br(fin, "", false); // will call parse_all_variants();
243-
bgen::CppBgenWriter bw(fout, br.header.nsamples, br.header.extra, br.header.compression, br.header.layout, br.samples.samples);
192+
bgen::CppBgenReader br(fin, "", false); // will call parse_all_variants();
193+
bgen::CppBgenWriter bw(fout, br.header.nsamples, br.header.extra, br.header.compression, br.header.layout,
194+
br.samples.samples);
244195
for (auto i : idx) {
245196
auto var = br.variants[i];
246197
std::vector<std::uint8_t> data = var.copy_data();
@@ -253,7 +204,8 @@ PermMat permute_bgen(std::string &fin, std::string fout, int nthreads) {
253204
bgen::CppBgenReader br(fin, "", true);
254205
uint nsnps = br.header.nvariants;
255206
std::string out = fout + ".perm.bgen";
256-
bgen::CppBgenWriter bw(out, br.header.nsamples, br.header.extra, br.header.compression, br.header.layout, br.samples.samples);
207+
bgen::CppBgenWriter bw(out, br.header.nsamples, br.header.extra, br.header.compression, br.header.layout,
208+
br.samples.samples);
257209
vector<int> perm(nsnps);
258210
std::iota(perm.begin(), perm.end(), 0);
259211
auto rng = std::default_random_engine{};

src/FileBgen.hpp

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,6 @@
99
// const double BGEN_MISSING_VALUE = -9;
1010
// const double BGEN2GENO[4] = {0, 0.5, 1, BGEN_MISSING_VALUE};
1111

12-
void read_bgen_block(Mat2D& G, Mat1D& F, bgen::CppBgenReader* bg, float* dosages,
13-
bool& frequency_was_estimated, uint64 nsamples, uint64 nsnps, uint blocksize,
14-
uint64 start_idx, uint64 stop_idx, bool standardize);
15-
16-
int shuffle_bgen_to_bin(std::string& fin, std::string fout, uint gb, bool standardize);
17-
18-
void permute_bgen_thread(std::vector<int> idx, std::string fin, std::string fout, int ithread);
19-
20-
PermMat permute_bgen(std::string& fin, std::string fout, int nthreads);
2112

2213
class FileBgen : public Data {
2314
public:
@@ -28,9 +19,10 @@ class FileBgen : public Data {
2819
nsamples = bg->header.nsamples;
2920
nsnps = bg->header.nvariants;
3021
dosages.resize(nsamples);
31-
F = Mat1D::Zero(nsnps); // initial F
32-
cao.print(tick.date(), "the layout is", bg->header.layout, ", compression is", bg->header.compression , ". N (#samples):", nsamples,
33-
". M (#SNPs):", nsnps);
22+
F = Mat1D::Zero(nsnps); // initial F
23+
cao.print(tick.date(), "N(#samples) =", nsamples, ", M(#SNPs) =", nsnps);
24+
cao.print(tick.date(), "the layout is", bg->header.layout, ", compressed by",
25+
bg->header.compression == 2 ? "zstd" : "zlib");
3426
}
3527

3628
~FileBgen() { delete bg; }
@@ -50,4 +42,8 @@ class FileBgen : public Data {
5042
bool frequency_was_estimated = false;
5143
};
5244

45+
void permute_bgen_thread(std::vector<int> idx, std::string fin, std::string fout, int ithread);
46+
47+
PermMat permute_bgen(std::string& fin, std::string fout, int nthreads);
48+
5349
#endif // PCAONE_FILEBGEN_

0 commit comments

Comments
 (0)