From 286ce3a0be14453c591cd1aba788859e788e3255 Mon Sep 17 00:00:00 2001
From: Yannick Mahlich <yannick.mahlich@pnnl.gov>
Date: Wed, 7 May 2025 11:50:11 -0700
Subject: [PATCH 1/2] slight refactoring of file handling in Dataset.load() in
 preparation for gene table fix

---
 coderdata/dataset/dataset.py | 60 +++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 15 deletions(-)

diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py
index a09fa830..838970d2 100644
--- a/coderdata/dataset/dataset.py
+++ b/coderdata/dataset/dataset.py
@@ -470,6 +470,21 @@ def load(
         _description_
     """
 
+    data_types_to_load = (
+        'transcriptomics',
+        'proteomics',
+        'mutations',
+        'copy_number',
+        'samples',
+        'drugs',
+        'drug_descriptors',
+        'mirna',
+        'experiments',
+        'methylation',
+        'metabolomics',
+        'genes',
+    )
+
     if type(local_path) is not Path:
         try:
             local_path = Path(local_path)
@@ -487,30 +502,45 @@ def load(
         dataset = Dataset(name)
         accepted_file_endings = ('.csv', '.tsv', '.csv.gz', '.tsv.gz')
         print(f"Importing raw data ...", file=sys.stderr)
-        for child in local_path.iterdir():
-            if child.name in ["genes.csv", "genes.csv.gz"]:
+        
+        # generating the file list that contains all files that need to 
+        # be imported based on the Dataset name
+        files = {}
+        for p in local_path.glob(f'{name}_*'):
+            if p.name.endswith(accepted_file_endings) and p.is_file():
+                dataset_type = p.name[len(name)+1:].split('.')[0]
+                files[dataset_type] = p
+        for p in local_path.glob(f'genes*'):
+            if p.name.endswith(accepted_file_endings) and p.is_file():
+                files['genes'] = p
+
+        for dataset_type in data_types_to_load:
+            if dataset_type not in files:
                 print(
-                    f"Importing 'genes' from {child} ...",
-                    end=' ',
+                    f"'{dataset_type}' not available for {name}",
+                    end='\n',
                     file=sys.stderr
                     )
-                dataset.genes = _load_file(child)
-                print("DONE", file=sys.stderr)
-
-            if (
-                child.name.startswith(name)
-                and child.name.endswith(accepted_file_endings)
-                ):
-
-                dataset_type = child.name[len(name)+1:].split('.')[0]
+                continue
+            file = files[dataset_type]
+            if dataset_type != 'genes':
                 print(
-                    f"Importing '{dataset_type}' from {child} ...",
+                    f"Importing '{dataset_type}' from {file} ...",
                     end=' ',
                     file=sys.stderr
                     )
                 if hasattr(dataset, dataset_type):
-                    setattr(dataset, dataset_type, _load_file(child))
+                    setattr(dataset, dataset_type, _load_file(file))
                     print("DONE", file=sys.stderr)
+            else:
+                print(
+                    f"Importing 'genes' from {file} ...",
+                    end=' ',
+                    file=sys.stderr
+                    )
+                dataset.genes = _load_file(file)
+                print("DONE", file=sys.stderr)
+
         print(f"Importing raw data ... DONE", file=sys.stderr)
         return dataset
 

From d6532e0e5499e26f1283ed1524876aba85674728 Mon Sep 17 00:00:00 2001
From: Yannick Mahlich <yannick.mahlich@pnnl.gov>
Date: Wed, 7 May 2025 13:02:13 -0700
Subject: [PATCH 2/2] added logic to subset genes table to only contain those
 genes that are associated with the cancer dataset of interest

---
 coderdata/dataset/dataset.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/coderdata/dataset/dataset.py b/coderdata/dataset/dataset.py
index 838970d2..2a9dc6b2 100644
--- a/coderdata/dataset/dataset.py
+++ b/coderdata/dataset/dataset.py
@@ -533,12 +533,30 @@ def load(
                     setattr(dataset, dataset_type, _load_file(file))
                     print("DONE", file=sys.stderr)
             else:
+                '''
+                The genes dataset available in the online repository is
+                universal and contains information on genes of all 
+                datasets. To that end it needs to be subsetted to only
+                those genes that are associated with a specific cancer
+                dataset.
+                '''
                 print(
                     f"Importing 'genes' from {file} ...",
                     end=' ',
                     file=sys.stderr
                     )
                 dataset.genes = _load_file(file)
+
+                entrez_ids = set()
+                for dataset_type in ('transcriptomics', 'proteomics',
+                                     'mutations', 'copy_number'):
+                    if getattr(dataset, dataset_type) is not None:
+                        entrez_ids.update(list(
+                           getattr(dataset, dataset_type)['entrez_id'].unique()
+                        ))                
+                dataset.genes = dataset.genes[
+                    dataset.genes['entrez_id'].isin(entrez_ids)
+                    ]
                 print("DONE", file=sys.stderr)
 
         print(f"Importing raw data ... DONE", file=sys.stderr)