Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Synchronize cohorts with the NHGRI GWAS Catalog, when available #394

Merged
merged 9 commits into from
Nov 7, 2024
2 changes: 1 addition & 1 deletion curation/parsers/performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def str2metric(self, field, val):
val = self.replace_non_ascii_chars(field,val)

# Estimate with percentage as unit
if re.match('^\d+\.?\d*\s*\%$',val):
if re.match(r'^\d+\.?\d*\s*\%$',val):
val = val.replace('%','').strip()
current_metric.add_data('estimate', val)
current_metric.add_data('unit', '%')
Expand Down
4 changes: 2 additions & 2 deletions curation/parsers/sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def str2demographic(self, field, val):
- val: data value
Return type: DemographicData object
'''
unit_regex = "([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
unit_regex = r"([-+]?\d*\.\d+|\d+) ([a-zA-Z]+)"
current_demographic = DemographicData(field,val,self.spreadsheet_name)
if type(val) == float:
current_demographic.add_data('estimate', val)
Expand Down Expand Up @@ -129,7 +129,7 @@ def create_sample_model(self):
elif field == 'sample_percent_male':
# Remove % character
val_str = str(val)
if re.search('\%',val_str):
if re.search(r'\%',val_str):
val_str = re.sub(r'\%', r'', val_str)
val_str = re.sub(r' ', r'', val_str)
val = float(val_str)
Expand Down
160 changes: 99 additions & 61 deletions curation/template_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,12 +201,17 @@ def extract_samples(self):
sample_keys = sample_data.data.keys()
if 'sample_number' not in sample_keys:
if 'source_GWAS_catalog' in sample_keys:
gwas_study = get_gwas_study(sample_data.data['source_GWAS_catalog'])
spreadsheet_cohorts = []
if 'cohorts' in sample_keys:
spreadsheet_cohorts = sample_data.data['cohorts']
gwas_study = self.get_gwas_study(sample_data.data['source_GWAS_catalog'],spreadsheet_cohorts,spreadsheet_name)
if gwas_study:
for gwas_ancestry in gwas_study:
c_sample = SampleData(spreadsheet_name)
# Spreadsheet sample/cohort data
for col, entry in sample_data.data.items():
c_sample.add_data(col, entry)
# GWAS Catalog sample/cohort data
for field, val in gwas_ancestry.items():
c_sample.add_data(field, val)
self.update_report(c_sample)
Expand Down Expand Up @@ -289,6 +294,99 @@ def get_sample_data(self, sample_info, current_schema, spreadsheet_name, samples
return sample_data


def get_gwas_study(self, gcst_id: str, spreadsheet_cohorts: list, spreadsheet_name: str) -> list:
    """
    Get the GWAS Study information related to the PGS sample.
    Check that all the required data is available
    > Parameter:
        - gcst_id: GWAS Study ID (e.g. GCST010127)
        - spreadsheet_cohorts: list of CohortData objects for the current sample, collected from the spreadsheet
          (an empty list or None means that the spreadsheet provides no cohort)
        - spreadsheet_name: Spreadsheet name for report (e.g. Sample Descriptions)
    > Return: list of dictionaries (1 per 'initial' ancestry). The list is empty when the
      study can't be retrieved, and may be partial if the response is malformed part-way.
    """
    study_data = []
    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
    # NOTE(review): no timeout is passed, so a stalled connection blocks the import - consider adding one.
    response = requests.get(f'{gwas_rest_url}{gcst_id}')

    # A Response object is falsy when the HTTP status is an error (4xx/5xx)
    if not response:
        return study_data
    response_data = response.json()
    if not response_data:
        return study_data

    # List the cohorts present in the spreadsheet for this sample.
    # Work on a private copy so that the caller's list is never modified.
    sheet_cohorts = list(spreadsheet_cohorts) if spreadsheet_cohorts else []
    sheet_cohorts_names = [x.name.upper() for x in sheet_cohorts]

    try:
        source_PMID = response_data['publicationInfo']['pubmedId']

        # Update the Cohorts list found in the cohort column of the spreadsheet by
        # adding the list of cohorts from the GWAS study (if the list is present).
        # A missing, null or empty 'cohort' field is treated as "no GWAS cohort"
        # rather than aborting the study or reporting an error for a cohort named "".
        cohorts_list = list(sheet_cohorts)
        gwas_cohorts = response_data.get('cohort')
        if gwas_cohorts:
            for cohort in gwas_cohorts.split('|'):
                cohort_id = cohort.upper()
                if cohort_id not in self.parsed_cohorts:
                    # The cohort must be declared in the cohort references
                    self.report_error(spreadsheet_name, f'Error: the GWAS Catalog sample cohort "{cohort}" cannot be found in the Cohort Refr. spreadsheet')
                elif cohort_id not in sheet_cohorts_names:
                    # Only add the cohorts which are not already provided by the author
                    cohorts_list.append(self.parsed_cohorts[cohort_id])

        # Report a warning if the list of Cohorts from the spreadsheet and from GWAS Catalog (REST API) have been merged.
        if sheet_cohorts and len(sheet_cohorts) != len(cohorts_list):
            spreadsheet_names = ', '.join(sorted(sheet_cohorts_names))
            merged_names = ', '.join(sorted(x.name.upper() for x in cohorts_list))
            msg = (
                f'GWAS study {gcst_id} -> the list of cohorts from the spreadsheet has been merged with the one from GWAS.\n'
                f'\t- Spreadsheet list: {spreadsheet_names}\n'
                f'\t+ Merged GWAS list: {merged_names}'
            )
            self.report_warning(spreadsheet_name, msg)

        # Ancestry information
        for ancestry in response_data['ancestries']:

            # Only the ancestries of type 'initial' are imported
            if ancestry['type'] != 'initial':
                continue

            ancestry_data = {'source_PMID': source_PMID}
            # Add cohorts list (the same list object is shared by every ancestry of the study)
            if cohorts_list:
                ancestry_data['cohorts'] = cohorts_list
            ancestry_data['sample_number'] = ancestry['numberOfIndividuals']

            # ancestry_broad
            broad_groups = [group['ancestralGroup'] for group in ancestry['ancestralGroups']]
            if broad_groups:
                ancestry_data['ancestry_broad'] = ','.join(broad_groups)

            # ancestry_free (country of origin) and ancestry_country (country of recruitment).
            # 'NR' entries are skipped (presumably "not reported" - confirm against the GWAS Catalog documentation).
            for field, source_key in (('ancestry_free', 'countryOfOrigin'), ('ancestry_country', 'countryOfRecruitment')):
                countries = [c['countryName'] for c in ancestry[source_key] if c['countryName'] != 'NR']
                if countries:
                    ancestry_data[field] = ','.join(countries)

            # ancestry_additional
            # Not found in the REST API

            study_data.append(ancestry_data)
    except Exception:
        # Catch 'Exception' rather than using a bare 'except', so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print(f"Error: can't fetch GWAS results for {gcst_id}")
    return study_data



def get_model_field_from_schema(self, col, current_schema):
'''
Retrieve the model and field from the Template, that corresponds to the current spreadsheet column.
Expand Down Expand Up @@ -370,66 +468,6 @@ def has_report_info(self):
# Independent methods #
#=======================#

def get_gwas_study(gcst_id):
    """
    Get the GWAS Study information related to the PGS sample.
    Check that all the required data is available
    > Parameter:
        - gcst_id: GWAS Study ID (e.g. GCST010127)
    > Return: list of dictionaries (1 per 'initial' ancestry). The list is empty when the
      study can't be retrieved, and may be partial if the response is malformed part-way.
    """
    study_data = []
    gwas_rest_url = 'https://www.ebi.ac.uk/gwas/rest/api/studies/'
    # NOTE(review): no timeout is passed, so a stalled connection blocks the import - consider adding one.
    response = requests.get(f'{gwas_rest_url}{gcst_id}')

    # A Response object is falsy when the HTTP status is an error (4xx/5xx)
    if not response:
        return study_data
    response_data = response.json()
    if not response_data:
        return study_data

    try:
        source_PMID = response_data['publicationInfo']['pubmedId']
        for ancestry in response_data['ancestries']:

            # Only the ancestries of type 'initial' are imported
            if ancestry['type'] != 'initial':
                continue

            ancestry_data = {'source_PMID': source_PMID}
            ancestry_data['sample_number'] = ancestry['numberOfIndividuals']

            # ancestry_broad
            broad_groups = [group['ancestralGroup'] for group in ancestry['ancestralGroups']]
            if broad_groups:
                ancestry_data['ancestry_broad'] = ','.join(broad_groups)

            # ancestry_free (country of origin) and ancestry_country (country of recruitment).
            # 'NR' entries are skipped (presumably "not reported" - confirm against the GWAS Catalog documentation).
            for field, source_key in (('ancestry_free', 'countryOfOrigin'), ('ancestry_country', 'countryOfRecruitment')):
                countries = [c['countryName'] for c in ancestry[source_key] if c['countryName'] != 'NR']
                if countries:
                    ancestry_data[field] = ','.join(countries)

            # ancestry_additional
            # Not found in the REST API

            study_data.append(ancestry_data)
    except Exception:
        # Catch 'Exception' rather than using a bare 'except', so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        print(f"Error: can't fetch GWAS results for {gcst_id}")
    return study_data


def next_PSS_num():
r = SampleSet.objects.last()
if r == None:
Expand Down
78 changes: 61 additions & 17 deletions release/scripts/UpdateGwasStudies.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import requests
from catalog.models import Sample, Score
from pgs_web import constants

from catalog.models import Sample, Score, Cohort


class UpdateGwasStudies:
Expand All @@ -16,15 +14,15 @@ def __init__(self,verbose=None):
self.verbose = verbose


def get_gwas_info(self,sample):
def get_gwas_info(self,sample:Sample) -> dict:
"""
Get the GWAS Study information related to the PGS sample.
Check that all the required data is available
> Parameter:
- gcst_id: GWAS Study ID (e.g. GCST010127)
> Return: list of dictionnaries (1 per ancestry)
- sample: instance of a Sample model
> Return: dictionary (cohorts and ancestries)
"""
study_data = []
study_data = { "ancestries": [] }
gcst_id = sample.source_GWAS_catalog
response = requests.get(f'{self.gwas_rest_url}{gcst_id}')

Expand All @@ -37,6 +35,26 @@ def get_gwas_info(self,sample):
if response_data:
try:
source_PMID = response_data['publicationInfo']['pubmedId']

# Create list of cohorts if it exists in the GWAS study
# This override the Cohorts found previously in the cohort column in the spreadsheet
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we get a warning/message if the spreadsheet cohorts are replaced? (including previous and new values)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yes, we can add that

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will look like that:

# GCST00XXXX:
	1 distinct ancestries
	{'source_PMID': '1234567', 'sample_number': 23464, 'ancestry_broad': 'European', 'ancestry_country': 'U.K.'}
	/!\ Replacing cohorts list:
	  - Old set: ACTS, LASA
	  + New set: UKB
	>> SCORE updated: PGS00XXXX

with the new bits:

/!\ Replacing cohorts list:
  - Old set: ACTS, LASA
  + New set: UKB

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be useful to know whether it's just additions or whether things are missing. It's more likely to be correct if it's adding more annotations, but if it's removing an annotation we should be careful.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's using the GCST, it should match what is in the GWAS Catalog ?
At the moment this script overwrites the Sample if the sample_number, sample_cases and sample_controls are NULL, i.e. GCSTs which were not released when the study was imported.

cohorts_list = []
if 'cohort' in response_data.keys():
cohorts = response_data['cohort'].split('|')
for cohort in cohorts:
cohort_id = cohort.upper()
try:
cohort_model = Cohort.objects.get(name_short__iexact=cohort_id)
cohorts_list.append(cohort_model)
except Cohort.DoesNotExist:
print(f"New cohort found: {cohort_id}")
cohort_model = Cohort(name_short=cohort_id,name_full=cohort_id)
cohort_model.save()
cohorts_list.append(cohort_model)
if cohorts_list:
study_data['cohorts'] = cohorts_list

# Ancestries
for ancestry in response_data['ancestries']:

if ancestry['type'] != 'initial':
Expand Down Expand Up @@ -70,12 +88,12 @@ def get_gwas_info(self,sample):
else:
ancestry_data['ancestry_country'] += self.country_sep
ancestry_data['ancestry_country'] += countryOfRecruitment['countryName']
study_data.append(ancestry_data)
study_data["ancestries"].append(ancestry_data)

if study_data:
print(f'\t{len(study_data)} distinct ancestries')
if study_data["ancestries"]:
print(f'\t{len(study_data["ancestries"])} distinct ancestries')
if self.verbose:
for anc in study_data:
for anc in study_data["ancestries"]:
print(f'\t{anc}')
else:
print("\tNo ancestry")
Expand All @@ -90,7 +108,12 @@ def update_studies(self):
for sample in self.samples:
gwas_study = self.get_gwas_info(sample)
new_samples = []
for gwas_ancestry in gwas_study:
cohorts_list = []
# List of cohorts
if 'cohorts' in gwas_study.keys():
cohorts_list = gwas_study['cohorts']
# List of ancestry data
for gwas_ancestry in gwas_study['ancestries']:
new_sample = Sample()
new_sample.source_GWAS_catalog = sample.source_GWAS_catalog
for field, val in gwas_ancestry.items():
Expand All @@ -99,11 +122,32 @@ def update_studies(self):
setattr(new_sample, field, val)
new_sample.save()

# Cohorts - need to be added once the Sample object as been saved,
# i.e. when the Sample `id` has been created
if sample.cohorts:
for cohort in sample.cohorts.all():
new_sample.cohorts.add(cohort)
# Cohorts data
if cohorts_list or sample.cohorts:
# Use the list of cohorts from the GWAS study (if available)
# Update the list of cohorts from the existing sample if new cohorts are found in the GWAS study
if cohorts_list:
new_sample.cohorts.set(cohorts_list)
# Print a message if the 2 list of cohorts (old & new) are different
if sample.cohorts:
new_set = sorted([x.name_short.upper() for x in cohorts_list])

old_set_string = ', '.join(sorted([x.name_short.upper() for x in sample.cohorts.all()]))
new_set_string = ', '.join(new_set)
if old_set_string != new_set_string:
# Add cohorts which are already associated to the sample in the database, but not in the GWAS study
for sample_cohort in sample.cohorts.all():
if sample_cohort.name_short.upper() not in new_set:
new_sample.cohorts.add(sample_cohort)
print(f"\t/!\ Replacing cohorts list:")
print(f"\t - Old set: {old_set_string}")
print(f"\t + New set: {', '.join(sorted([x.name_short.upper() for x in new_sample.cohorts.all()]))}")
# Copy the list of cohorts from the existing sample.
# Need to be added once the new Sample object as been saved,
# i.e. when the Sample `id` has been created
elif sample.cohorts:
for cohort in sample.cohorts.all():
new_sample.cohorts.add(cohort)
new_sample.save()

new_samples.append(new_sample)
Expand Down