From 286ce3a0be14453c591cd1aba788859e788e3255 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Wed, 7 May 2025 11:50:11 -0700 Subject: [PATCH 1/2] slight refactoring of file handling in Dataset.load() in preparation for gene table fix --- coderdata/dataset/dataset.py | 60 +++++++++++++++++++++++++++--------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index a09fa830..838970d2 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -470,6 +470,21 @@ def load( _description_ """ + data_types_to_load = ( + 'transcriptomics', + 'proteomics', + 'mutations', + 'copy_number', + 'samples', + 'drugs', + 'drug_descriptors', + 'mirna', + 'experiments', + 'methylation', + 'metabolomics', + 'genes', + ) + if type(local_path) is not Path: try: local_path = Path(local_path) @@ -487,30 +502,45 @@ def load( dataset = Dataset(name) accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz') print(f"Importing raw data ...", file=sys.stderr) - for child in local_path.iterdir(): - if child.name in ["genes.csv", "genes.csv.gz"]: + + # generating the file list that contains all files that need to + # be imported based on the Dataset name + files = {} + for p in local_path.glob(f'{name}_*'): + if p.name.endswith(accepted_file_endings) and p.is_file(): + dataset_type = p.name[len(name)+1:].split('.')[0] + files[dataset_type] = p + for p in local_path.glob(f'genes*'): + if p.name.endswith(accepted_file_endings) and p.is_file(): + files['genes'] = p + + for dataset_type in data_types_to_load: + if dataset_type not in files: print( - f"Importing 'genes' from {child} ...", - end=' ', + f"'{dataset_type}' not available for {name}", + end='\n', file=sys.stderr ) - dataset.genes = _load_file(child) - print("DONE", file=sys.stderr) - - if ( - child.name.startswith(name) - and child.name.endswith(accepted_file_endings) - ): - - dataset_type = child.name[len(name)+1:].split('.')[0] + 
continue + file = files[dataset_type] + if dataset_type != 'genes': print( - f"Importing '{dataset_type}' from {child} ...", + f"Importing '{dataset_type}' from {file} ...", end=' ', file=sys.stderr ) if hasattr(dataset, dataset_type): - setattr(dataset, dataset_type, _load_file(child)) + setattr(dataset, dataset_type, _load_file(file)) print("DONE", file=sys.stderr) + else: + print( + f"Importing 'genes' from {file} ...", + end=' ', + file=sys.stderr + ) + dataset.genes = _load_file(file) + print("DONE", file=sys.stderr) + print(f"Importing raw data ... DONE", file=sys.stderr) return dataset From d6532e0e5499e26f1283ed1524876aba85674728 Mon Sep 17 00:00:00 2001 From: Yannick Mahlich Date: Wed, 7 May 2025 13:02:13 -0700 Subject: [PATCH 2/2] added logic to subset genes table to only contain those genes that are associated with cancer dataset of interest --- coderdata/dataset/dataset.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py index 838970d2..2a9dc6b2 100644 --- a/coderdata/dataset/dataset.py +++ b/coderdata/dataset/dataset.py @@ -533,12 +533,30 @@ def load( setattr(dataset, dataset_type, _load_file(file)) print("DONE", file=sys.stderr) else: + ''' + The genes dataset available in the online repository is + universal and contains information on genes of all + datasets. To that end it needs to be subsetted to only + those genes that are associate with a specific cancer + dataset. + ''' print( f"Importing 'genes' from {file} ...", end=' ', file=sys.stderr ) dataset.genes = _load_file(file) + + entrez_ids = set() + for dataset_type in ('transcriptomics', 'proteomics', + 'mutations', 'copy_number'): + if getattr(dataset, dataset_type) is not None: + entrez_ids.update(list( + getattr(dataset, dataset_type)['entrez_id'].unique() + )) + dataset.genes = dataset.genes[ + dataset.genes['entrez_id'].isin(entrez_ids) + ] print("DONE", file=sys.stderr) print(f"Importing raw data ... 
DONE", file=sys.stderr)