PGScatalog · fyvon · Nov 7, 2024 · Oct 17, 2024 · Oct 17, 2024 · Oct 18, 2024
diff --git a/curation/parsers/performance.py b/curation/parsers/performance.py
@@ -52,7 +52,7 @@ def str2metric(self, field, val):
         val = self.replace_non_ascii_chars(field,val)
 
         # Estimate with percentage as unit
-        if re.match('^\d+\.?\d*\s*\%$',val):
+        if re.match(r'^\d+\.?\d*\s*\%$',val):
             val = val.replace('%','').strip()
             current_metric.add_data('estimate', val)
             current_metric.add_data('unit', '%')

diff --git a/curation/parsers/sample.py b/curation/parsers/sample.py
@@ -18,7 +18,7 @@ def str2demographic(self, field, val):
         - val: data value
         Return type: DemographicData object
         '''
-        unit_regex = "([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
+        unit_regex = r"([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
         current_demographic = DemographicData(field,val,self.spreadsheet_name)
         if type(val) == float:
             current_demographic.add_data('estimate', val)
@@ -129,7 +129,7 @@ def create_sample_model(self):
                     elif field == 'sample_percent_male':
                         # Remove % character
                         val_str = str(val)
-                        if re.search('\%',val_str):
+                        if re.search(r'\%',val_str):
                             val_str = re.sub(r'\%', r'', val_str)
                             val_str = re.sub(r' ', r'', val_str)
                             val = float(val_str)

diff --git a/curation/template_parser.py b/curation/template_parser.py
@@ -201,12 +201,17 @@ def extract_samples(self):
             sample_keys = sample_data.data.keys()
             if 'sample_number' not in sample_keys:
                 if 'source_GWAS_catalog' in sample_keys:
-                    gwas_study = get_gwas_study(sample_data.data['source_GWAS_catalog'])
+                    spreadsheet_cohorts = []
+                    if 'cohorts' in sample_keys:
+                        spreadsheet_cohorts = sample_data.data['cohorts']
+                    gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
                     if gwas_study:
                         for gwas_ancestry in gwas_study:
                             c_sample = SampleData(spreadsheet_name)
+                            # Spreadsheet sample/cohort data
                             for col, entry in sample_data.data.items():
                                 c_sample.add_data(col, entry)
+                            # GWAS Catalog sample/cohort data
                             for field, val in gwas_ancestry.items():
                                 c_sample.add_data(field, val)
                             self.update_report(c_sample)
@@ -289,6 +294,99 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
         return sample_data
 
 
+    def get_gwas_study(self,gcst_id:str,spreadsheet_cohorts:list,spreadsheet_name:str) -> list:
+        """
+        Get the GWAS Study information related to the PGS sample.
+        Check that all the required data is available
+        > Parameter:
+            - gcst_id: GWAS Study ID (e.g. GCST010127)
+            - spreadsheet_cohorts: list of CohortData objects for the current sample, collected from the spreadsheet
+            - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions)
+        > Return: list of dictionaries (1 per ancestry of type 'initial').
+                  The list is empty if the request fails or if the response can't be parsed.
+        """
+        study_data = []
+        gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
+        response = requests.get(f'{gwas_rest_url}{gcst_id}')
+
+        # A requests Response is falsy when the HTTP status code is an error (4xx/5xx)
+        if not response:
+            return study_data
+        response_data = response.json()
+        if response_data:
+            # List the cohorts present in the spreadsheet for this sample
+            spreadsheet_cohorts_names = []
+            if spreadsheet_cohorts:
+                spreadsheet_cohorts_names = [x.name.upper() for x in spreadsheet_cohorts]
+
+            try:
+                source_PMID = response_data['publicationInfo']['pubmedId']
+                # Update the Cohorts list found in the cohort column of the spreadsheet by
+                # adding the list of cohorts from the GWAS study (if the list is present)
+                cohorts_list = spreadsheet_cohorts.copy()
+                # A missing, null or empty 'cohort' value means that there is nothing to merge
+                if response_data.get('cohort'):
+                    cohorts = response_data['cohort'].split('|')
+                    for cohort in cohorts:
+                        cohort_id = cohort.upper()
+                        # Check if cohort in list of cohort references
+                        # and if the cohort is already in the list provided by the author
+                        if cohort_id in self.parsed_cohorts:
+                            if cohort_id not in spreadsheet_cohorts_names:
+                                cohorts_list.append(self.parsed_cohorts[cohort_id])
+                        else:
+                            self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
+                    # Print a message if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged.
+                    if spreadsheet_cohorts and len(spreadsheet_cohorts) != len(cohorts_list):
+                        msg = f'''GWAS study {gcst_id} -> the list of cohorts from the spreadsheet has been merged with the one from GWAS.
+                        \t- Spreadsheet list: {', '.join(sorted(spreadsheet_cohorts_names))}
+                        \t+ Merged GWAS list: {', '.join(sorted([x.name.upper() for x in cohorts_list]))}'''
+                        self.report_warning(spreadsheet_name, msg)
+
+                # Ancestry information
+                for ancestry in response_data['ancestries']:
+
+                    if ancestry['type'] != 'initial':
+                        continue
+
+                    ancestry_data = { 'source_PMID': source_PMID }
+                    # Add cohorts list
+                    if cohorts_list:
+                        ancestry_data['cohorts'] = cohorts_list
+                    ancestry_data['sample_number'] = ancestry['numberOfIndividuals']
+
+                    # ancestry_broad (the field is only added if at least one value is found)
+                    ancestry_broad = [x['ancestralGroup'] for x in ancestry['ancestralGroups']]
+                    if ancestry_broad:
+                        ancestry_data['ancestry_broad'] = ','.join(ancestry_broad)
+
+                    # ancestry_free ('NR' entries are skipped)
+                    ancestry_free = [
+                        x['countryName'] for x in ancestry['countryOfOrigin']
+                        if x['countryName'] != 'NR'
+                    ]
+                    if ancestry_free:
+                        ancestry_data['ancestry_free'] = ','.join(ancestry_free)
+
+                    # ancestry_country ('NR' entries are skipped)
+                    ancestry_country = [
+                        x['countryName'] for x in ancestry['countryOfRecruitment']
+                        if x['countryName'] != 'NR'
+                    ]
+                    if ancestry_country:
+                        ancestry_data['ancestry_country'] = ','.join(ancestry_country)
+                    # ancestry_additional
+                    # Not found in the REST API
+
+                    study_data.append(ancestry_data)
+            # Any failure while parsing/merging is reported and the data collected so far is returned.
+            # 'Exception' (instead of a bare 'except') lets KeyboardInterrupt/SystemExit propagate.
+            except Exception:
+                print(f'Error: can\'t fetch GWAS results for {gcst_id}')
+        return study_data
+
+
+
     def get_model_field_from_schema(self, col, current_schema):
         '''
         Retrieve the model and field from the Template, that corresponds to the current spreadsheet column.
@@ -370,66 +468,6 @@ def has_report_info(self):
 #  Independent methods  #
 #=======================#
 
-def get_gwas_study(gcst_id):
-    """
-    Get the GWAS Study information related to the PGS sample.
-    Check that all the required data is available
-    > Parameter:
-        - gcst_id: GWAS Study ID (e.g. GCST010127)
-    > Return: list of dictionnaries (1 per ancestry)
-    """
-    study_data = []
-    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
-    response = requests.get(f'{gwas_rest_url}{gcst_id}')
-
-    if not response:
-        return study_data
-    response_data = response.json()
-    if response_data:
-        try:
-            source_PMID = response_data['publicationInfo']['pubmedId']
-            for ancestry in response_data['ancestries']:
-
-                if ancestry['type'] != 'initial':
-                    continue
-
-                ancestry_data = { 'source_PMID': source_PMID }
-                ancestry_data['sample_number'] = ancestry['numberOfIndividuals']
-
-                # ancestry_broad
-                for ancestralGroup in ancestry['ancestralGroups']:
-                    if not 'ancestry_broad' in ancestry_data:
-                        ancestry_data['ancestry_broad'] = ''
-                    else:
-                        ancestry_data['ancestry_broad'] += ','
-                    ancestry_data['ancestry_broad'] += ancestralGroup['ancestralGroup']
-
-                # ancestry_free
-                for countryOfOrigin in ancestry['countryOfOrigin']:
-                    if countryOfOrigin['countryName'] != 'NR':
-                        if not 'ancestry_free' in ancestry_data:
-                            ancestry_data['ancestry_free'] = ''
-                        else:
-                            ancestry_data['ancestry_free'] += ','
-                        ancestry_data['ancestry_free'] += countryOfOrigin['countryName']
-
-                # ancestry_country
-                for countryOfRecruitment in ancestry['countryOfRecruitment']:
-                    if countryOfRecruitment['countryName'] != 'NR':
-                        if not 'ancestry_country' in ancestry_data:
-                            ancestry_data['ancestry_country'] = ''
-                        else:
-                            ancestry_data['ancestry_country'] += ','
-                        ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
-                # ancestry_additional
-                # Not found in the REST API
-
-                study_data.append(ancestry_data)
-        except:
-            print(f'Error: can\'t fetch GWAS results for {gcst_id}')
-    return study_data
-
-
 def next_PSS_num():
     r = SampleSet.objects.last()
     if r == None:

diff --git a/release/scripts/UpdateGwasStudies.py b/release/scripts/UpdateGwasStudies.py
@@ -1,7 +1,5 @@
 import requests
-from catalog.models import Sample, Score
-from pgs_web import constants
-
+from catalog.models import Sample, Score, Cohort
 
 
 class UpdateGwasStudies:
@@ -16,15 +14,15 @@ def __init__(self,verbose=None):
         self.verbose = verbose
 
 
-    def get_gwas_info(self,sample):
+    def get_gwas_info(self,sample:Sample) -> dict:
         """
         Get the GWAS Study information related to the PGS sample.
         Check that all the required data is available
         > Parameter:
-            - gcst_id: GWAS Study ID (e.g. GCST010127)
-        > Return: list of dictionnaries (1 per ancestry)
+            - sample: instance of a Sample model
+        > Return: dictionary (cohorts and ancestries)
         """
-        study_data = []
+        study_data = { "ancestries": [] }
         gcst_id = sample.source_GWAS_catalog
         response = requests.get(f'{self.gwas_rest_url}{gcst_id}')
 
@@ -37,6 +35,26 @@ def get_gwas_info(self,sample):
         if response_data:
             try:
                 source_PMID = response_data['publicationInfo']['pubmedId']
+
+                # Create list of cohorts if it exists in the GWAS study
+                # This overrides the Cohorts found previously in the cohort column in the spreadsheet
+                cohorts_list = []
+                if 'cohort' in response_data.keys():
+                    cohorts = response_data['cohort'].split('|')
+                    for cohort in cohorts:
+                        cohort_id = cohort.upper()
+                        try:
+                            cohort_model = Cohort.objects.get(name_short__iexact=cohort_id)
+                            cohorts_list.append(cohort_model)
+                        except Cohort.DoesNotExist:
+                            print(f"New cohort found: {cohort_id}")
+                            cohort_model = Cohort(name_short=cohort_id,name_full=cohort_id)
+                            cohort_model.save()
+                            cohorts_list.append(cohort_model)
+                    if cohorts_list:
+                        study_data['cohorts'] = cohorts_list
+
+                # Ancestries
                 for ancestry in response_data['ancestries']:
 
                     if ancestry['type'] != 'initial':
@@ -70,12 +88,12 @@ def get_gwas_info(self,sample):
                             else:
                                 ancestry_data['ancestry_country'] += self.country_sep
                             ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
-                    study_data.append(ancestry_data)
+                    study_data["ancestries"].append(ancestry_data)
 
-                if study_data:
-                    print(f'\t{len(study_data)} distinct ancestries')
+                if study_data["ancestries"]:
+                    print(f'\t{len(study_data["ancestries"])} distinct ancestries')
                     if self.verbose:
-                        for anc in study_data:
+                        for anc in study_data["ancestries"]:
                             print(f'\t{anc}')
                 else:
                     print("\tNo ancestry")
@@ -90,7 +108,12 @@ def update_studies(self):
         for sample in self.samples:
             gwas_study = self.get_gwas_info(sample)
             new_samples = []
-            for gwas_ancestry in gwas_study:
+            cohorts_list = []
+            # List of cohorts
+            if 'cohorts' in gwas_study.keys():
+                cohorts_list = gwas_study['cohorts']
+            # List of ancestry data
+            for gwas_ancestry in gwas_study['ancestries']:
                 new_sample = Sample()
                 new_sample.source_GWAS_catalog = sample.source_GWAS_catalog
                 for field, val in gwas_ancestry.items():
@@ -99,11 +122,32 @@ def update_studies(self):
                     setattr(new_sample, field, val)
                 new_sample.save()
 
-                # Cohorts - need to be added once the Sample object as been saved,
-                # i.e. when the Sample `id` has been created
-                if sample.cohorts:
-                    for cohort in sample.cohorts.all():
-                        new_sample.cohorts.add(cohort)
+                # Cohorts data
+                if cohorts_list or sample.cohorts:
+                    # Use the list of cohorts from the GWAS study (if available)
+                    # Update the list of cohorts from the existing sample if new cohorts are found in the GWAS study
+                    if cohorts_list:
+                        new_sample.cohorts.set(cohorts_list)
+                        # Print a message if the 2 lists of cohorts (old & new) are different
+                        if sample.cohorts:
+                            new_set = sorted([x.name_short.upper() for x in cohorts_list])
+
+                            old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
+                            new_set_string = ', '.join(new_set)
+                            if old_set_string != new_set_string:
+                                # Add cohorts which are already associated to the sample in the database, but not in the GWAS study
+                                for sample_cohort in sample.cohorts.all():
+                                    if sample_cohort.name_short.upper() not in new_set:
+                                        new_sample.cohorts.add(sample_cohort)
+                                print(f"\t/!\ Replacing cohorts list:")
+                                print(f"\t  - Old set: {old_set_string}")
+                                print(f"\t  + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
+                    # Copy the list of cohorts from the existing sample.
+                    # Need to be added once the new Sample object has been saved,
+                    # i.e. when the Sample `id` has been created
+                    elif sample.cohorts:
+                        for cohort in sample.cohorts.all():
+                            new_sample.cohorts.add(cohort)
                     new_sample.save()
 
                 new_samples.append(new_sample)