Skip to content

Commit

Permalink
Refactored the methods to use the helper method 'constrain_column_types'.
Browse files Browse the repository at this point in the history
  • Loading branch information
biomadeira committed Sep 19, 2017
1 parent eac69e0 commit c332ad4
Show file tree
Hide file tree
Showing 8 changed files with 18 additions and 69 deletions.
9 changes: 2 additions & 7 deletions prointvar/arpeggio.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
from prointvar.utils import lazy_file_remover
from prointvar.utils import row_selector
from prointvar.utils import string_split
from prointvar.utils import constrain_column_types
from prointvar.library import arpeggio_types
from prointvar.library import arpeggio_col_renames

Expand Down Expand Up @@ -106,13 +107,7 @@ def parse_arpeggio_from_file(inputfile, excluded=(), add_res_split=True,
pass

# enforce some specific column types
for col in table:
if col in arpeggio_types:
try:
table[col] = table[col].astype(arpeggio_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, arpeggio_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
9 changes: 2 additions & 7 deletions prointvar/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from prointvar.utils import get_rsa_class
from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.library import dssp_types

from prointvar.config import config
Expand Down Expand Up @@ -182,13 +183,7 @@ def parse_dssp_from_file(inputfile, excluded=(), add_full_chain=True, add_ss_red
pass

# enforce some specific column types
for col in table:
if col in dssp_types:
try:
table[col] = table[col].astype(dssp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, dssp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
9 changes: 2 additions & 7 deletions prointvar/hbplus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.library import hbplus_types

from prointvar.config import config
Expand Down Expand Up @@ -97,13 +98,7 @@ def fix_res_id(data, key):
pass

# enforce some specific column types
for col in table:
if col in hbplus_types:
try:
table[col] = table[col].astype(hbplus_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, hbplus_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
8 changes: 2 additions & 6 deletions prointvar/msas.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from Bio import AlignIO

from prointvar.fetchers import fetch_uniprot_id_from_name
from prointvar.utils import constrain_column_types

logger = logging.getLogger("prointvar")

Expand Down Expand Up @@ -92,12 +93,7 @@ def parse_msa_sequences_from_file(inputfile, excluded=(), get_uniprot_id=False,

# enforce some specific column types
msa_types = {key: str for key in list(table) if key != 'Start' and key != 'End'}
for col in table:
try:
table[col] = table[col].astype(msa_types[col])
except (ValueError, KeyError):
# there are some NaNs in there
pass
table = constrain_column_types(table, msa_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
17 changes: 3 additions & 14 deletions prointvar/pdbx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from prointvar.utils import string_split
from prointvar.utils import get_new_pro_ids
from prointvar.utils import check_sequence
from prointvar.utils import constrain_column_types
from prointvar.library import mmcif_types
from prointvar.library import aa_default_atoms
from prointvar.library import aa_codes_3to1_extended
Expand Down Expand Up @@ -144,13 +145,7 @@ def parse_mmcif_atoms_from_file(inputfile, excluded=(), add_res_full=True,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down Expand Up @@ -279,13 +274,7 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
9 changes: 2 additions & 7 deletions prointvar/sifts.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from collections import OrderedDict

from prointvar.utils import row_selector
from prointvar.utils import constrain_column_types
from prointvar.library import sifts_types

logger = logging.getLogger("prointvar")
Expand Down Expand Up @@ -210,13 +211,7 @@ def parse_sifts_residues_from_file(inputfile, excluded=(),
table = pd.DataFrame(rows)

# enforce some specific column types
for col in table:
if col in sifts_types:
try:
table[col] = table[col].astype(sifts_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, sifts_types)

for c in list(table):
if '_regionId' in c:
Expand Down
9 changes: 2 additions & 7 deletions prointvar/stamp.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

from prointvar.pdbx import PDBXwriter

from prointvar.utils import constrain_column_types
from prointvar.library import stamp_types

from prointvar.config import config
Expand Down Expand Up @@ -237,13 +238,7 @@ def parse_stamp_scan_scores_from_file(inputfile, excluded=()):
pass

# enforce some specific column types
for col in table:
if col in stamp_types:
try:
table[col] = table[col].astype(stamp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, stamp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
Expand Down
17 changes: 3 additions & 14 deletions prointvar/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from prointvar.utils import merging_down_by_key
from prointvar.utils import flatten_nested_structure
from prointvar.utils import refactor_key_val_singletons
from prointvar.utils import constrain_column_types
from prointvar.library import uni_ens_var_types
from prointvar.library import update_ensembl_to_uniprot

Expand Down Expand Up @@ -67,13 +68,7 @@ def flatten_uniprot_variants_ebi(data, excluded=()):
pass

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')
Expand Down Expand Up @@ -116,13 +111,7 @@ def flatten_ensembl_variants(data, excluded=(), synonymous=True):
pass

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')
Expand Down

0 comments on commit c332ad4

Please sign in to comment.