Skip to content

Commit ea3dc4b

Browse files
committed
New updates to LDMatrix + SumstatsTable data structures.
1 parent e6d4481 commit ea3dc4b

File tree

7 files changed

+288
-33
lines changed

7 files changed

+288
-33
lines changed

CHANGELOG.md

+6
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ name (before it was returning true for inliers...).
2222

2323
- Added `get_peak_memory_usage` to `system_utils` to inspect peak memory usage of a process.
2424
- Placeholder method to perform QC on `SumstatsTable` objects (needs to be implemented still).
25+
- New attached dataset for long-range LD regions.
26+
- New method in SumstatsTable to impute rsID (if missing).
27+
- Preliminary support for matching with CHR+POS in SumstatsTable (still needs more work).
28+
- LDMatrix updates:
29+
- New method to filter long-range LD regions.
30+
- New method to prune LD matrix.
2531
- New algorithm for symmetrizing upper triangular and block diagonal LD matrices.
2632
- Much faster and more memory efficient than using `scipy`.
2733
- New `LDMatrix` class has efficient data loading in `.load_data` method.

magenpy/GWADataLoader.py

+63-15
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,10 @@ def read_summary_statistics(self,
548548
desc="Reading summary statistics",
549549
disable=not self.verbose or len(sumstats_files) < 2):
550550

551-
ss_tab = SumstatsTable.from_file(f, sumstats_format=sumstats_format, parser=parser, **parse_kwargs)
551+
ss_tab = SumstatsTable.from_file(f,
552+
sumstats_format=sumstats_format,
553+
parser=parser,
554+
**parse_kwargs)
552555

553556
if drop_duplicated:
554557
ss_tab.drop_duplicates()
@@ -565,6 +568,19 @@ def read_summary_statistics(self,
565568

566569
self.sumstats_table.update(ss_tab.split_by_chromosome(snps_per_chrom=ref_table))
567570

571+
# If SNP information is not present in the sumstats tables, try to impute it
572+
# using other reference tables:
573+
574+
missing_snp = any('SNP' not in ss.table.columns for ss in self.sumstats_table.values())
575+
576+
if missing_snp and (self.genotype is not None or self.ld is not None):
577+
578+
ref_table = self.to_snp_table(col_subset=['CHR', 'POS', 'SNP'], per_chromosome=True)
579+
580+
for c, ss in self.sumstats_table.items():
581+
if 'SNP' not in ss.table.columns and c in ref_table:
582+
ss.infer_snp_id(ref_table[c], allow_na=True)
583+
568584
def read_ld(self, ld_store_paths):
569585
"""
570586
Read the LD matrix files stored on-disk in Zarr array format.
@@ -672,6 +688,10 @@ def harmonize_data(self):
672688
However, if you read or manipulate the data sources after initialization,
673689
you may need to call this method again to ensure that the data sources remain aligned.
674690
691+
!!! warning
692+
Harmonization for now depends on having SNP rsID be present in all the resources. Hopefully
693+
this requirement will be relaxed in the future.
694+
675695
"""
676696

677697
data_sources = (self.genotype, self.sumstats_table, self.ld, self.annotation)
@@ -705,8 +725,8 @@ def harmonize_data(self):
705725

706726
else:
707727

708-
# Find the set of SNPs that are shared across all data sources:
709-
common_snps = np.array(list(set.intersection(*[set(ds[c].snps)
728+
# Find the set of SNPs that are shared across all data sources (exclude missing values):
729+
common_snps = np.array(list(set.intersection(*[set(ds[c].snps[~pd.isnull(ds[c].snps)])
710730
for ds in initialized_data_sources])))
711731

712732
# If necessary, filter the data sources to only have the common SNPs:
@@ -717,10 +737,17 @@ def harmonize_data(self):
717737
# Harmonize the summary statistics data with either genotype or LD reference.
718738
# This procedure checks for flips in the effect allele between data sources.
719739
if self.sumstats_table is not None:
740+
741+
id_cols = self.sumstats_table[c].identifier_cols
742+
720743
if self.genotype is not None:
721-
self.sumstats_table[c].match(self.genotype[c].get_snp_table(col_subset=['SNP', 'A1', 'A2']))
744+
self.sumstats_table[c].match(self.genotype[c].get_snp_table(
745+
col_subset=id_cols + ['A1', 'A2']
746+
))
722747
elif self.ld is not None:
723-
self.sumstats_table[c].match(self.ld[c].to_snp_table(col_subset=['SNP', 'A1', 'A2']))
748+
self.sumstats_table[c].match(self.ld[c].to_snp_table(
749+
col_subset=id_cols + ['A1', 'A2']
750+
))
724751

725752
# If during the allele matching process we discover incompatibilities,
726753
# we filter those SNPs:
@@ -763,15 +790,17 @@ def score(self, beta=None, standardize_genotype=False):
763790
try:
764791
beta = {c: s.marginal_beta or s.get_snp_pseudo_corr() for c, s in self.sumstats_table.items()}
765792
except Exception:
766-
raise ValueError("To perform linear scoring, you must provide effect size estimates (BETA)!")
793+
raise ValueError("To perform linear scoring, you must "
794+
"provide effect size estimates (BETA)!")
767795

768796
# Here, we have a very ugly way of accounting for
769797
# the fact that the chromosomes may be coded differently between the genotype
770798
# and the beta dictionary. Maybe we can find a better solution in the future.
771799
common_chr_g, common_chr_b = match_chromosomes(self.genotype.keys(), beta.keys(), return_both=True)
772800

773801
if len(common_chr_g) < 1:
774-
raise ValueError("No common chromosomes found between the genotype and the effect size estimates!")
802+
raise ValueError("No common chromosomes found between "
803+
"the genotype and the effect size estimates!")
775804

776805
if self.verbose and len(common_chr_g) < 2:
777806
print("> Generating polygenic scores...")
@@ -831,7 +860,7 @@ def to_phenotype_table(self):
831860

832861
return self.sample_table.get_phenotype_table()
833862

834-
def to_snp_table(self, col_subset=None, per_chromosome=False):
863+
def to_snp_table(self, col_subset=None, per_chromosome=False, resource='auto'):
835864
"""
836865
Get a dataframe of SNP data for all variants
837866
across different chromosomes.
@@ -840,21 +869,40 @@ def to_snp_table(self, col_subset=None, per_chromosome=False):
840869
:param per_chromosome: If True, returns a dictionary where the key
841870
is the chromosome number and the value is the SNP table per
842871
chromosome.
872+
:param resource: The data source to extract the SNP table from. By default, the method
873+
will try to extract the SNP table from the genotype matrix. If the genotype matrix is not
874+
available, then it will try to extract the SNP information from the LD matrix or the summary
875+
statistics table. Possible values: `auto`, `genotype`, `ld`, `sumstats`.
843876
844877
:return: A dataframe (or dictionary of dataframes) of SNP data.
845878
"""
846879

880+
# Sanity checks:
881+
assert resource in ('auto', 'genotype', 'ld', 'sumstats')
882+
if resource != 'auto':
883+
if resource == 'genotype' and self.genotype is None:
884+
raise ValueError("Genotype matrix is not available!")
885+
if resource == 'ld' and self.ld is None:
886+
raise ValueError("LD matrix is not available!")
887+
if resource == 'sumstats' and self.sumstats_table is None:
888+
raise ValueError("Summary statistics table is not available!")
889+
else:
890+
if all(ds is None for ds in (self.genotype, self.ld, self.sumstats_table)):
891+
raise ValueError("No data sources available to extract SNP data from!")
892+
893+
# Extract the SNP data:
894+
847895
snp_tables = {}
848896

849-
for c in self.chromosomes:
850-
if self.sumstats_table is not None:
851-
snp_tables[c] = self.sumstats_table[c].to_table(col_subset=col_subset)
852-
elif self.genotype is not None:
897+
if resource in ('auto', 'genotype') and self.genotype is not None:
898+
for c in self.chromosomes:
853899
snp_tables[c] = self.genotype[c].get_snp_table(col_subset=col_subset)
854-
elif self.ld is not None:
900+
elif resource in ('auto', 'ld') and self.ld is not None:
901+
for c in self.chromosomes:
855902
snp_tables[c] = self.ld[c].to_snp_table(col_subset=col_subset)
856-
else:
857-
raise ValueError("GWADataLoader instance is not properly initialized!")
903+
else:
904+
return self.to_summary_statistics_table(col_subset=col_subset,
905+
per_chromosome=per_chromosome)
858906

859907
if per_chromosome:
860908
return snp_tables

magenpy/LDMatrix.py

+58
Original file line numberDiff line numberDiff line change
@@ -796,6 +796,40 @@ def indptr(self):
796796
else:
797797
return self._zg['matrix/indptr']
798798

799+
def filter_long_range_ld_regions(self):
800+
"""
801+
A utility method to exclude variants that are in long-range LD regions. The
802+
boundaries of those regions are derived from here:
803+
804+
https://genome.sph.umich.edu/wiki/Regions_of_high_linkage_disequilibrium_(LD)
805+
806+
Which is based on the work of
807+
808+
> Anderson, Carl A., et al. "Data quality control in genetic case-control association studies." Nature protocols 5.9 (2010): 1564-1573.
809+
810+
.. note ::
811+
This method is experimental and may not work as expected for all LD matrices.
812+
"""
813+
814+
from .parsers.annotation_parsers import parse_annotation_bed_file
815+
from .utils.data_utils import lrld_path
816+
817+
bed_df = parse_annotation_bed_file(lrld_path())
818+
819+
# Filter to only regions specific to the chromosome of this matrix:
820+
bed_df = bed_df.loc[bed_df['CHR'] == self.chromosome]
821+
822+
bp_pos = self.bp_position
823+
snp_mask = np.ones(len(bp_pos), dtype=bool)
824+
825+
# Loop over the LRLD region on this chromosome and exclude the SNPs in these regions:
826+
for _, row in bed_df.iterrows():
827+
start, end = row['Start'], row['End']
828+
snp_mask &= ~((bp_pos >= start) & (bp_pos <= end))
829+
830+
# Filter the SNP to only those not in the LRLD regions:
831+
self.filter_snps(self.snps[snp_mask])
832+
799833
def filter_snps(self, extract_snps=None, extract_file=None):
800834
"""
801835
Filter the LDMatrix to keep a subset of variants. This mainly sets
@@ -859,6 +893,30 @@ def reset_mask(self):
859893
return_symmetric=self.is_symmetric,
860894
dtype=self.dtype)
861895

896+
def prune(self, threshold):
897+
"""
898+
Perform LD pruning to remove variants that are in high LD with other variants.
899+
If two variants are in high LD, this function keeps the variant that occurs
900+
earlier in the matrix. This behavior will be updated in the future to allow
901+
for arbitrary ordering of variants.
902+
903+
!!! note
904+
Experimental for now. Needs further testing & improvement.
905+
906+
:param threshold: The absolute value of the Pearson correlation coefficient above which to prune variants.
907+
:return: A boolean array indicating whether a variant is kept after pruning. A positive floating point number
908+
between 0. and 1.
909+
"""
910+
911+
from .stats.ld.c_utils import prune_ld_ut
912+
913+
assert 0. < threshold <= 1.
914+
915+
if np.issubdtype(self.stored_dtype, np.integer):
916+
threshold = quantize(np.array([threshold]), int_dtype=self.stored_dtype)[0]
917+
918+
return prune_ld_ut(self.indptr[:], self.data[:], threshold)
919+
862920
def to_snp_table(self, col_subset=None):
863921
"""
864922
:param col_subset: The subset of columns to add to the table. If None, it returns

magenpy/SumstatsTable.py

+61-18
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,18 @@ def __init__(self, ss_table: pd.DataFrame):
3434
"""
3535
self.table: pd.DataFrame = ss_table
3636

37-
assert all([col in self.table.columns for col in ('SNP', 'A1')])
37+
# Check that the table contains some of the required columns (non exhaustive):
38+
39+
# Either has SNP or CHR+POS:
40+
assert 'SNP' in self.table.columns or all([col in self.table.columns for col in ('CHR', 'POS')])
41+
# Assert that the table has at least one of the alleles:
42+
assert any([col in self.table.columns for col in ('A1', 'A2')])
43+
# TODO: Add other assertions?
3844

3945
@property
4046
def shape(self):
4147
"""
42-
:return: he shape of the summary statistics table.
48+
:return: The shape of the summary statistics table.
4349
"""
4450
return self.table.shape
4551

@@ -49,7 +55,8 @@ def __len__(self):
4955
@property
5056
def chromosome(self):
5157
"""
52-
A convenience method to return the chromosome number if there is only one chromosome in the summary statistics.
58+
A convenience method to return the chromosome number if there is only
59+
one chromosome in the summary statistics.
5360
If multiple chromosomes are present, it returns None.
5461
5562
:return: The chromosome number if there is only one chromosome in the summary statistics.
@@ -76,6 +83,13 @@ def m(self):
7683
"""
7784
return self.n_snps
7885

86+
@property
87+
def identifier_cols(self):
88+
if 'SNP' in self.table.columns:
89+
return ['SNP']
90+
else:
91+
return ['CHR', 'POS']
92+
7993
@property
8094
def n_snps(self):
8195
"""
@@ -341,21 +355,29 @@ def effect_sign(self):
341355
def infer_a2(self, reference_table, allow_na=False):
342356
"""
343357
Infer the reference allele A2 (if not present in the SumstatsTable)
344-
from a reference table. Make sure that the reference table contains the SNP ID,
345-
the reference allele A2 and the alternative (i.e. effect) allele A1. It is the
346-
user's responsibility to make sure that the reference table matches the summary
347-
statistics in terms of the specification of reference vs. alternative. They are
348-
allowed to be flipped, but they have to be consistent across the two tables.
358+
from a reference table. Make sure that the reference table contains the identifier information
359+
for each SNP, in addition to the reference allele A2 and the alternative (i.e. effect) allele A1.
360+
It is the user's responsibility to make sure that the reference table matches the summary
361+
statistics in terms of the specification of reference vs. alternative. They have to be consistent
362+
across the two tables.
349363
350364
:param reference_table: A pandas table containing the following columns at least:
351-
`SNP`, `A1`, `A2`.
365+
SNP identifiers (`SNP` or `CHR` & `POS`) and allele information (`A1` & `A2`).
352366
:param allow_na: If True, allow the reference allele to be missing from the final result.
353367
"""
354368

355-
# Merge the summary statistics table with the reference table on `SNP` ID:
356-
merged_table = self.table[['SNP', 'A1']].merge(reference_table[['SNP', 'A1', 'A2']],
357-
how='left',
358-
on='SNP')
369+
# Get the identifier columns for this table:
370+
id_cols = self.identifier_cols
371+
372+
# Sanity checks:
373+
assert all([col in reference_table.columns for col in id_cols + ['A1', 'A2']])
374+
375+
# Merge the summary statistics table with the reference table on unique ID:
376+
merged_table = self.table[id_cols + ['A1']].merge(
377+
reference_table[id_cols + ['A1', 'A2']],
378+
how='left',
379+
on=id_cols
380+
)
359381
# If `A1_x` agrees with `A1_y`, then `A2` is indeed the reference allele.
360382
# Otherwise, they are flipped and `A1_y` should be the reference allele:
361383
merged_table['A2'] = np.where(merged_table['A1_x'] == merged_table['A1_y'],
@@ -368,6 +390,25 @@ def infer_a2(self, reference_table, allow_na=False):
368390
else:
369391
self.table['A2'] = merged_table['A2']
370392

393+
def infer_snp_id(self, reference_table, allow_na=False):
394+
"""
395+
Infer the SNP ID (if not present in the SumstatsTable) from a reference table.
396+
Make sure that the reference table contains the SNP ID, chromosome ID, and position.
397+
398+
:param reference_table: A pandas table containing the following columns at least:
399+
`SNP`, `CHR`, `POS`.
400+
:param allow_na: If True, allow the SNP ID to be missing from the final result.
401+
"""
402+
403+
# Merge the summary statistics table with the reference table:
404+
merged_table = self.table[['CHR', 'POS']].merge(reference_table[['SNP', 'CHR', 'POS']], how='left')
405+
406+
# Check that the SNP ID could be inferred for all SNPs:
407+
if not allow_na and merged_table['SNP'].isna().any():
408+
raise ValueError("The SNP ID could not be inferred for some SNPs!")
409+
else:
410+
self.table['SNP'] = merged_table['SNP'].values
411+
371412
def set_sample_size(self, n):
372413
"""
373414
Set the sample size for each variant in the summary table.
@@ -394,14 +435,15 @@ def match(self, reference_table, correct_flips=True):
394435
correcting for potential flips in the effect alleles.
395436
396437
:param reference_table: The SNP table to use as a reference. Must be a pandas
397-
table with at least three columns: SNP, A1, A2.
438+
table with the following columns: SNP identifier (either `SNP` or `CHR` & `POS`) and allele information
439+
(`A1` & `A2`).
398440
:param correct_flips: If True, correct the direction of effect size
399441
estimates if the effect allele is reversed.
400442
"""
401443

402444
from .utils.model_utils import merge_snp_tables
403445

404-
self.table = merge_snp_tables(ref_table=reference_table[['SNP', 'A1', 'A2']],
446+
self.table = merge_snp_tables(ref_table=reference_table[self.identifier_cols + ['A1', 'A2']],
405447
alt_table=self.table,
406448
how='inner',
407449
correct_flips=correct_flips)
@@ -457,14 +499,15 @@ def filter_snps(self, extract_snps=None, extract_file=None, extract_index=None):
457499
self.table = self.table.iloc[extract_index, ].reset_index(drop=True)
458500
else:
459501
raise Exception("To filter a summary statistics table, you must provide "
460-
"the list of SNPs, a file containing the list of SNPs, or a list of indices to retain.")
502+
"the list of SNPs, a file containing the list of SNPs, "
503+
"or a list of indices to retain.")
461504

462505
def drop_duplicates(self):
463506
"""
464507
Drop variants with duplicated rsIDs from the summary statistics table.
465508
"""
466509

467-
self.table = self.table.drop_duplicates(subset='SNP', keep=False)
510+
self.table = self.table.drop_duplicates(subset=self.identifier_cols, keep=False)
468511

469512
def get_col(self, col_name):
470513
"""
@@ -583,7 +626,7 @@ def split_by_chromosome(self, snps_per_chrom=None):
583626
if 'CHR' in self.table.columns:
584627
chrom_tables = self.table.groupby('CHR')
585628
return {
586-
c: SumstatsTable(chrom_tables.get_group(c))
629+
c: SumstatsTable(chrom_tables.get_group(c).copy())
587630
for c in chrom_tables.groups
588631
}
589632
elif snps_per_chrom is not None:

0 commit comments

Comments
 (0)