
Merge pull request #13 from bartongroup/refactoring
Refactoring
biomadeira authored Sep 19, 2017
2 parents 92973e5 + 7e4f556 commit 7c17572
Showing 9 changed files with 86 additions and 140 deletions.
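In short: the column-exclusion and dtype-coercion logic that was previously inlined in each parser module is moved into two shared helpers, exclude_columns and constrain_column_types, in prointvar/utils.py. A rough sketch of the repeated pattern being replaced (some_types stands in for each module's own type map, e.g. arpeggio_types or dssp_types):

    # before: repeated in each parser module
    if excluded is not None:
        assert type(excluded) is tuple
        try:
            table = table.drop(list(excluded), axis=1)
        except ValueError:
            pass  # columns not present
    for col in table:
        if col in some_types:
            try:
                table[col] = table[col].astype(some_types[col])
            except ValueError:
                pass  # NaNs prevent the cast

    # after: two calls to the shared helpers
    table = exclude_columns(table, excluded=excluded)
    table = constrain_column_types(table, some_types)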
29 changes: 7 additions & 22 deletions prointvar/arpeggio.py
@@ -39,6 +39,8 @@
from prointvar.utils import lazy_file_remover
from prointvar.utils import row_selector
from prointvar.utils import string_split
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import arpeggio_types
from prointvar.library import arpeggio_col_renames

@@ -97,22 +99,11 @@ def parse_arpeggio_from_file(inputfile, excluded=(), add_res_split=True,
table = add_special_cont_types(inputfile, table)
logger.info("Parsed special contact-types...")

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in arpeggio_types:
try:
table[col] = table[col].astype(arpeggio_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, arpeggio_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
@@ -184,14 +175,8 @@ def parse_arpeggio_spec_from_file(inputfile, excluded=(), add_res_split=True,
if add_res_split:
table = add_arpeggio_res_split(table)

if excluded is not None:
excluded = tuple([k for k in excluded if k in header])
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
19 changes: 5 additions & 14 deletions prointvar/dssp.py
@@ -24,6 +24,8 @@
from prointvar.utils import get_rsa_class
from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import dssp_types

from prointvar.config import config
@@ -173,22 +175,11 @@ def parse_dssp_from_file(inputfile, excluded=(), add_full_chain=True, add_ss_red
table['LINE'] = table.index + 1
logger.info("DSSP reset residue number...")

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in dssp_types:
try:
table[col] = table[col].astype(dssp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, dssp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
19 changes: 5 additions & 14 deletions prointvar/hbplus.py
@@ -19,6 +19,8 @@

from prointvar.utils import row_selector
from prointvar.utils import lazy_file_remover
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import hbplus_types

from prointvar.config import config
@@ -88,22 +90,11 @@ def fix_res_id(data, key):
table.INSCODE_D[table.INSCODE_D == "-"] = "?"
table.INSCODE_A[table.INSCODE_A == "-"] = "?"

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in hbplus_types:
try:
table[col] = table[col].astype(hbplus_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, hbplus_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
17 changes: 4 additions & 13 deletions prointvar/msas.py
@@ -20,6 +20,8 @@
from Bio import AlignIO

from prointvar.fetchers import fetch_uniprot_id_from_name
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns

logger = logging.getLogger("prointvar")

@@ -82,22 +84,11 @@ def parse_msa_sequences_from_file(inputfile, excluded=(), get_uniprot_id=False,
table = pd.DataFrame(rows)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
msa_types = {key: str for key in list(table) if key != 'Start' and key != 'End'}
for col in table:
try:
table[col] = table[col].astype(msa_types[col])
except (ValueError, KeyError):
# there are some NaNs in there
pass
table = constrain_column_types(table, msa_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
34 changes: 6 additions & 28 deletions prointvar/pdbx.py
@@ -25,6 +25,8 @@
from prointvar.utils import string_split
from prointvar.utils import get_new_pro_ids
from prointvar.utils import check_sequence
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import mmcif_types
from prointvar.library import aa_default_atoms
from prointvar.library import aa_codes_3to1_extended
@@ -94,13 +96,7 @@ def parse_mmcif_atoms_from_file(inputfile, excluded=(), add_res_full=True,
keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# if only first model (>1 in NMR structures)
if first_model:
@@ -144,13 +140,7 @@ def parse_mmcif_atoms_from_file(inputfile, excluded=(), add_res_full=True,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
@@ -226,13 +216,7 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False,
compression=None, converters=all_str, keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# if only first model (>1 in NMR structures)
if first_model:
@@ -279,13 +263,7 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False,
logger.info("PDBx reset atom numbers...")

# enforce some specific column types
for col in table:
if col in mmcif_types:
try:
table[col] = table[col].astype(mmcif_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, mmcif_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
9 changes: 2 additions & 7 deletions prointvar/sifts.py
@@ -18,6 +18,7 @@
from collections import OrderedDict

from prointvar.utils import row_selector
from prointvar.utils import constrain_column_types
from prointvar.library import sifts_types

logger = logging.getLogger("prointvar")
@@ -210,13 +211,7 @@ def parse_sifts_residues_from_file(inputfile, excluded=(),
table = pd.DataFrame(rows)

# enforce some specific column types
for col in table:
if col in sifts_types:
try:
table[col] = table[col].astype(sifts_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, sifts_types)

for c in list(table):
if '_regionId' in c:
18 changes: 4 additions & 14 deletions prointvar/stamp.py
@@ -22,6 +22,8 @@

from prointvar.pdbx import PDBXwriter

from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import stamp_types

from prointvar.config import config
@@ -228,22 +230,10 @@ def parse_stamp_scan_scores_from_file(inputfile, excluded=()):
keep_default_na=False)

# excluding columns
if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in stamp_types:
try:
table[col] = table[col].astype(stamp_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, stamp_types)

if table.empty:
raise ValueError('{} resulted in an empty DataFrame...'.format(inputfile))
45 changes: 45 additions & 0 deletions prointvar/utils.py
@@ -679,5 +679,50 @@ def get_start_end_ranges_consecutive_ints(data):
return tuple(starts), tuple(ends)


def constrain_column_types(data, dictionary, nan_value=None):
    """
    Helper function that constrains the data types of the
    various DataFrame columns.
    :param data: pandas DataFrame
    :param dictionary: (dict) maps column names to data types
    :param nan_value: optional value used to replace NaNs
    :return: modified pandas DataFrame
    """

    table = data
    for col in table:
        if col in dictionary:
            try:
                table[col] = table[col].astype(dictionary[col])
            except (ValueError, KeyError):
                # probably there are some NaNs in there
                pass
        # optionally replace any NaNs left behind (e.g. by a failed cast)
        if nan_value is not None and table[col].isnull().any():
            table[col] = table[col].fillna(nan_value)
    return table


def exclude_columns(data, excluded=()):
    """
    Helper function that filters out columns based
    on the column name.
    :param data: pandas DataFrame
    :param excluded: (tuple) optional columns to be excluded
    :return: modified pandas DataFrame
    """

    table = data
    if excluded is not None:
        assert type(excluded) is tuple
        try:
            table = table.drop(list(excluded), axis=1)
        except ValueError:
            # most likely these are not in there
            pass
    return table


if __name__ == '__main__':
pass
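A minimal usage sketch for the two new helpers (the DataFrame and type map below are hypothetical, not taken from the repository):

    import pandas as pd

    from prointvar.utils import constrain_column_types
    from prointvar.utils import exclude_columns

    # hypothetical table: 'RES' holds numbers stored as strings; 'TMP' is unwanted
    table = pd.DataFrame({'CHAIN': ['A', 'A'], 'RES': ['1', '2'], 'TMP': ['x', 'y']})
    table = exclude_columns(table, excluded=('TMP',))
    table = constrain_column_types(table, {'RES': int}, nan_value=0)
    # table now has columns CHAIN (object) and RES (int64);
    # nan_value=0 would only be applied if NaNs remained after the cast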
36 changes: 8 additions & 28 deletions prointvar/variants.py
@@ -28,6 +28,8 @@
from prointvar.utils import merging_down_by_key
from prointvar.utils import flatten_nested_structure
from prointvar.utils import refactor_key_val_singletons
from prointvar.utils import constrain_column_types
from prointvar.utils import exclude_columns
from prointvar.library import uni_ens_var_types
from prointvar.library import update_ensembl_to_uniprot

@@ -58,22 +60,11 @@ def flatten_uniprot_variants_ebi(data, excluded=()):

table = pd.DataFrame(var_rows)

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')
@@ -107,22 +98,11 @@ def flatten_ensembl_variants(data, excluded=(), synonymous=True):
# rename columns
table.rename(columns=update_ensembl_to_uniprot, inplace=True)

if excluded is not None:
assert type(excluded) is tuple
try:
table = table.drop(list(excluded), axis=1)
except ValueError:
# most likely these are not in there
pass
# excluding columns
table = exclude_columns(table, excluded=excluded)

# enforce some specific column types
for col in table:
if col in uni_ens_var_types:
try:
table[col] = table[col].astype(uni_ens_var_types[col])
except ValueError:
# there are some NaNs in there
pass
table = constrain_column_types(table, uni_ens_var_types)

# split multi id rows
table = splitting_up_by_key(table, key='xrefs_id')