@@ -548,7 +548,10 @@ def read_summary_statistics(self,
548
548
desc = "Reading summary statistics" ,
549
549
disable = not self .verbose or len (sumstats_files ) < 2 ):
550
550
551
- ss_tab = SumstatsTable .from_file (f , sumstats_format = sumstats_format , parser = parser , ** parse_kwargs )
551
+ ss_tab = SumstatsTable .from_file (f ,
552
+ sumstats_format = sumstats_format ,
553
+ parser = parser ,
554
+ ** parse_kwargs )
552
555
553
556
if drop_duplicated :
554
557
ss_tab .drop_duplicates ()
@@ -565,6 +568,19 @@ def read_summary_statistics(self,
565
568
566
569
self .sumstats_table .update (ss_tab .split_by_chromosome (snps_per_chrom = ref_table ))
567
570
571
+ # If the SNP ID column is missing from any of the sumstats tables, try to infer it
572
+ # using other reference tables:
573
+
574
+ missing_snp = any ('SNP' not in ss .table .columns for ss in self .sumstats_table .values ())
575
+
576
+ if missing_snp and (self .genotype is not None or self .ld is not None ):
577
+
578
+ ref_table = self .to_snp_table (col_subset = ['CHR' , 'POS' , 'SNP' ], per_chromosome = True )
579
+
580
+ for c , ss in self .sumstats_table .items ():
581
+ if 'SNP' not in ss .table .columns and c in ref_table :
582
+ ss .infer_snp_id (ref_table [c ], allow_na = True )
583
+
568
584
def read_ld (self , ld_store_paths ):
569
585
"""
570
586
Read the LD matrix files stored on-disk in Zarr array format.
@@ -672,6 +688,10 @@ def harmonize_data(self):
672
688
However, if you read or manipulate the data sources after initialization,
673
689
you may need to call this method again to ensure that the data sources remain aligned.
674
690
691
+ !!! warning
692
+ Harmonization currently requires SNP rsIDs to be present in all the resources. Hopefully
693
+ this requirement will be relaxed in the future.
694
+
675
695
"""
676
696
677
697
data_sources = (self .genotype , self .sumstats_table , self .ld , self .annotation )
@@ -705,8 +725,8 @@ def harmonize_data(self):
705
725
706
726
else :
707
727
708
- # Find the set of SNPs that are shared across all data sources:
709
- common_snps = np .array (list (set .intersection (* [set (ds [c ].snps )
728
+ # Find the set of SNPs that are shared across all data sources (exclude missing values) :
729
+ common_snps = np .array (list (set .intersection (* [set (ds [c ].snps [ ~ pd . isnull ( ds [ c ]. snps )] )
710
730
for ds in initialized_data_sources ])))
711
731
712
732
# If necessary, filter the data sources to only have the common SNPs:
@@ -717,10 +737,17 @@ def harmonize_data(self):
717
737
# Harmonize the summary statistics data with either genotype or LD reference.
718
738
# This procedure checks for flips in the effect allele between data sources.
719
739
if self .sumstats_table is not None :
740
+
741
+ id_cols = self .sumstats_table [c ].identifier_cols
742
+
720
743
if self .genotype is not None :
721
- self .sumstats_table [c ].match (self .genotype [c ].get_snp_table (col_subset = ['SNP' , 'A1' , 'A2' ]))
744
+ self .sumstats_table [c ].match (self .genotype [c ].get_snp_table (
745
+ col_subset = id_cols + ['A1' , 'A2' ]
746
+ ))
722
747
elif self .ld is not None :
723
- self .sumstats_table [c ].match (self .ld [c ].to_snp_table (col_subset = ['SNP' , 'A1' , 'A2' ]))
748
+ self .sumstats_table [c ].match (self .ld [c ].to_snp_table (
749
+ col_subset = id_cols + ['A1' , 'A2' ]
750
+ ))
724
751
725
752
# If during the allele matching process we discover incompatibilities,
726
753
# we filter those SNPs:
@@ -763,15 +790,17 @@ def score(self, beta=None, standardize_genotype=False):
763
790
try :
764
791
beta = {c : s .marginal_beta or s .get_snp_pseudo_corr () for c , s in self .sumstats_table .items ()}
765
792
except Exception :
766
- raise ValueError ("To perform linear scoring, you must provide effect size estimates (BETA)!" )
793
+ raise ValueError ("To perform linear scoring, you must "
794
+ "provide effect size estimates (BETA)!" )
767
795
768
796
# Here, we have a very ugly way of accounting for
769
797
# the fact that the chromosomes may be coded differently between the genotype
770
798
# and the beta dictionary. Maybe we can find a better solution in the future.
771
799
common_chr_g , common_chr_b = match_chromosomes (self .genotype .keys (), beta .keys (), return_both = True )
772
800
773
801
if len (common_chr_g ) < 1 :
774
- raise ValueError ("No common chromosomes found between the genotype and the effect size estimates!" )
802
+ raise ValueError ("No common chromosomes found between "
803
+ "the genotype and the effect size estimates!" )
775
804
776
805
if self .verbose and len (common_chr_g ) < 2 :
777
806
print ("> Generating polygenic scores..." )
@@ -831,7 +860,7 @@ def to_phenotype_table(self):
831
860
832
861
return self .sample_table .get_phenotype_table ()
833
862
834
- def to_snp_table (self , col_subset = None , per_chromosome = False ):
863
+ def to_snp_table (self , col_subset = None , per_chromosome = False , resource = 'auto' ):
835
864
"""
836
865
Get a dataframe of SNP data for all variants
837
866
across different chromosomes.
@@ -840,21 +869,40 @@ def to_snp_table(self, col_subset=None, per_chromosome=False):
840
869
:param per_chromosome: If True, returns a dictionary where the key
841
870
is the chromosome number and the value is the SNP table per
842
871
chromosome.
872
+ :param resource: The data source to extract the SNP table from. By default, the method
873
+ will try to extract the SNP table from the genotype matrix. If the genotype matrix is not
874
+ available, then it will try to extract the SNP information from the LD matrix or the summary
875
+ statistics table. Possible values: `auto`, `genotype`, `ld`, `sumstats`.
843
876
844
877
:return: A dataframe (or dictionary of dataframes) of SNP data.
845
878
"""
846
879
880
+ # Sanity checks:
881
+ assert resource in ('auto' , 'genotype' , 'ld' , 'sumstats' )
882
+ if resource != 'auto' :
883
+ if resource == 'genotype' and self .genotype is None :
884
+ raise ValueError ("Genotype matrix is not available!" )
885
+ if resource == 'ld' and self .ld is None :
886
+ raise ValueError ("LD matrix is not available!" )
887
+ if resource == 'sumstats' and self .sumstats_table is None :
888
+ raise ValueError ("Summary statistics table is not available!" )
889
+ else :
890
+ if all (ds is None for ds in (self .genotype , self .ld , self .sumstats_table )):
891
+ raise ValueError ("No data sources available to extract SNP data from!" )
892
+
893
+ # Extract the SNP data:
894
+
847
895
snp_tables = {}
848
896
849
- for c in self .chromosomes :
850
- if self .sumstats_table is not None :
851
- snp_tables [c ] = self .sumstats_table [c ].to_table (col_subset = col_subset )
852
- elif self .genotype is not None :
897
+ if resource in ('auto' , 'genotype' ) and self .genotype is not None :
898
+ for c in self .chromosomes :
853
899
snp_tables [c ] = self .genotype [c ].get_snp_table (col_subset = col_subset )
854
- elif self .ld is not None :
900
+ elif resource in ('auto' , 'ld' ) and self .ld is not None :
901
+ for c in self .chromosomes :
855
902
snp_tables [c ] = self .ld [c ].to_snp_table (col_subset = col_subset )
856
- else :
857
- raise ValueError ("GWADataLoader instance is not properly initialized!" )
903
+ else :
904
+ return self .to_summary_statistics_table (col_subset = col_subset ,
905
+ per_chromosome = per_chromosome )
858
906
859
907
if per_chromosome :
860
908
return snp_tables
0 commit comments