
Merge pull request #13 from bartongroup/refactoring
Refactoring
biomadeira authored Sep 19, 2017
2 parents 92973e5 + 7e4f556 commit 7c17572
Showing 9 changed files with 86 additions and 140 deletions.
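In short: the column-exclusion and dtype-coercion logic that was previously inlined in each parser module is moved into two shared helpers, exclude_columns and constrain_column_types, in prointvar/utils.py. A rough sketch of the repeated pattern being replaced (some_types stands in for each module's own type map, e.g. arpeggio_types or dssp_types):

    # before: repeated in each parser module
    if excluded is not None:
        assert type(excluded) is tuple
        try:
            table = table.drop(list(excluded), axis=1)
        except ValueError:
            pass  # columns not present
    for col in table:
        if col in some_types:
            try:
                table[col] = table[col].astype(some_types[col])
            except ValueError:
                pass  # NaNs prevent the cast

    # after: two calls to the shared helpers
    table = exclude_columns(table, excluded=excluded)
    table = constrain_column_types(table, some_types)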
29 changes: 7 additions & 22 deletions prointvar/arpeggio.py
@@ -39,6 +39,8 @@
from prointvar.utils import lazy_file_remover
from prointvar.utils import row_selector
from prointvar.utils import string_split
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import arpeggio_types
from prointvar.library import arpeggio_col_renames

@@ -97,22 +99,11 @@ def parse_arpeggio_from_file(inputfile, excluded=(), add_res_split=True,
table = add_special_cont_types(inputfile, table)
logger.info("Parsed special contact-types...")

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in arpeggio_types:
try:
table[col] = table[col].astype(arpeggio_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, arpeggio_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
@@ -184,14 +175,8 @@ def parse_arpeggio_spec_from_file(inputfile, excluded=(), add_res_split=True,
if add_res_split:
table = add_arpeggio_res_split(table)

if excluded is not None:
excluded = tuple([k for k in excluded if k in header])
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
19 changes: 5 additions & 14 deletions prointvar/dssp.py
@@ -24,6 +24,8 @@
from prointvar.utils import get_rsa_class
from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import dssp_types

from prointvar.config import config
@@ -173,22 +175,11 @@ def parse_dssp_from_file(inputfile, excluded=(), add_full_chain=True, add_ss_red
table['LINE'] = table.index + 1
logger.info("DSSP reset residue number...")

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in dssp_types:
try:
table[col] = table[col].astype(dssp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, dssp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
19 changes: 5 additions & 14 deletions prointvar/hbplus.py
@@ -19,6 +19,8 @@

from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import hbplus_types

from prointvar.config import config
@@ -88,22 +90,11 @@ def fix_res_id(data, key):
table.INSCODE_D[table.INSCODE_D == "-"] = "?"
table.INSCODE_A[table.INSCODE_A == "-"] = "?"

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in hbplus_types:
try:
table[col] = table[col].astype(hbplus_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, hbplus_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
17 changes: 4 additions & 13 deletions prointvar/msas.py
@@ -20,6 +20,8 @@
from Bio import AlignIO

from prointvar.fetchers import fetch_uniprot_id_from_name
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns

logger = logging.getLogger("prointvar")

@@ -82,22 +84,11 @@ def parse_msa_sequences_from_file(inputfile, excluded=(), get_uniprot_id=False,
table = pd.DataFrame(rows)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
msa_types = {key: str for key in list(table) if key != 'Start' and key != 'End'}
for col in table:
try:
table[col] = table[col].astype(msa_types[col])
except (ValueError, KeyError):
# there are some NaNs in there
pass
table = constrain_column_types(table, msa_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
34 changes: 6 additions & 28 deletions prointvar/pdbx.py
@@ -25,6 +25,8 @@
from prointvar.utils import string_split
from prointvar.utils import get_new_pro_ids
from prointvar.utils import check_sequence
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import mmcif_types
from prointvar.library import aa_default_atoms
from prointvar.library import aa_codes_3to1_extended
@@ -94,13 +96,7 @@ def parse_mmcif_atoms_from_file(inputfile, excluded=(), add_res_full=True,
keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# if only first model (>1 in NMR structures)
if first_model:
@@ -144,13 +140,7 @@ def parse_mmcif_atoms_from_file(inputfile, excluded=(), add_res_full=True,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
@@ -226,13 +216,7 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False,
compression=None, converters=all_str, keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# if only first model (>1 in NMR structures)
if first_model:
@@ -279,13 +263,7 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
9 changes: 2 additions & 7 deletions prointvar/sifts.py
@@ -18,6 +18,7 @@
from collections import OrderedDict

from prointvar.utils import row_selector
from prointvar.utils import constrain_column_types
from prointvar.library import sifts_types

logger = logging.getLogger("prointvar")
@@ -210,13 +211,7 @@ def parse_sifts_residues_from_file(inputfile, excluded=(),
table = pd.DataFrame(rows)

# enforce some specific column types
for col in table:
if col in sifts_types:
try:
table[col] = table[col].astype(sifts_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, sifts_types)

for c in list(table):
if '_regionId' in c:
18 changes: 4 additions & 14 deletions prointvar/stamp.py
@@ -22,6 +22,8 @@

from prointvar.pdbx import PDBXwriter

from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import stamp_types

from prointvar.config import config
@@ -228,22 +230,10 @@ def parse_stamp_scan_scores_from_file(inputfile, excluded=()):
keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in stamp_types:
try:
table[col] = table[col].astype(stamp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, stamp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
45 changes: 45 additions & 0 deletions prointvar/utils.py
@@ -679,5 +679,50 @@ def get_start_end_ranges_consecutive_ints(data):
return tuple(starts), tuple(ends)


def constrain_column_types(data, dictionary, nan_value=None):
    """
    Helper function that constrains the data types of the
    various DataFrame columns.
    :param data: pandas DataFrame
    :param dictionary: (dict) maps column names to data types
    :param nan_value: optional value used to replace NaNs
    :return: modified pandas DataFrame
    """

    table = data
    for col in table:
        if col in dictionary:
            try:
                table[col] = table[col].astype(dictionary[col])
            except (ValueError, KeyError):
                # probably there are some NaNs in there
                pass
        # optionally replace any NaNs left behind (e.g. by a failed cast)
        if nan_value is not None and table[col].isnull().any():
            table[col] = table[col].fillna(nan_value)
    return table


def exclude_columns(data, excluded=()):
    """
    Helper function that filters out columns based
    on the column name.
    :param data: pandas DataFrame
    :param excluded: (tuple) optional columns to be excluded
    :return: modified pandas DataFrame
    """

    table = data
    if excluded is not None:
        assert type(excluded) is tuple
        try:
            table = table.drop(list(excluded), axis=1)
        except ValueError:
            # most likely these are not in there
            pass
    return table


if __name__ == '__main__':
pass
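A minimal usage sketch for the two new helpers (the DataFrame and type map below are hypothetical, not taken from the repository):

    import pandas as pd

    from prointvar.utils import constrain_column_types
    from prointvar.utils import exclude_columns

    # hypothetical table: 'RES' holds numbers stored as strings; 'TMP' is unwanted
    table = pd.DataFrame({'CHAIN': ['A', 'A'], 'RES': ['1', '2'], 'TMP': ['x', 'y']})
    table = exclude_columns(table, excluded=('TMP',))
    table = constrain_column_types(table, {'RES': int}, nan_value=0)
    # table now has columns CHAIN (object) and RES (int64);
    # nan_value=0 would only be applied if NaNs remained after the cast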
36 changes: 8 additions & 28 deletions prointvar/variants.py
@@ -28,6 +28,8 @@
from prointvar.utils import merging_down_by_key
from prointvar.utils import flatten_nested_structure
from prointvar.utils import refactor_key_val_singletons
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import uni_ens_var_types
from prointvar.library import update_ensembl_to_uniprot

@@ -58,22 +60,11 @@ def flatten_uniprot_variants_ebi(data, excluded=()):

table = pd.DataFrame(var_rows)

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')
@@ -107,22 +98,11 @@ def flatten_ensembl_variants(data, excluded=(), synonymous=True):
# rename columns
table.rename(columns=update_ensembl_to_uniprot, inplace=True)

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')