diff --git a/prointvar/pdbx.py b/prointvar/pdbx.py index 35b8008..865acf8 100644 --- a/prointvar/pdbx.py +++ b/prointvar/pdbx.py @@ -151,7 +151,8 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False, dist=5, first_model=True, add_atom_altloc=False, remove_altloc=False, remove_hydrogens=True, reset_atom_id=True, add_new_pro_id=False, - remove_partial_res=True): + remove_partial_res=True, fix_label_alt_id=True, + fix_ins_code=True, fix_type_symbol=True): """ Parse PDB ATOM and HETATM lines. @@ -166,6 +167,9 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False, :param reset_atom_id: boolean :param add_new_pro_id: (boolean) used for chain_id mapping :param remove_partial_res: (boolean) removes amino acids with missing atoms + :param fix_label_alt_id: boolean + :param fix_ins_code: boolean + :param fix_type_symbol: boolean :return: returns a pandas DataFrame """ @@ -222,11 +226,14 @@ def parse_pdb_atoms_from_file(inputfile, excluded=(), add_contacts=False, table = row_selector(table, key='pdbx_PDB_model_num', value='first') # fixes the 'pdbx_PDB_ins_code' - table = fix_pdb_ins_code(table) + if fix_ins_code: + table = pdb_fix_pdb_ins_code(table) # fixes the 'label_alt_id - table = fix_label_alt_id(table) + if fix_label_alt_id: + table = pdb_fix_label_alt_id(table) # fixes 'type_symbol' if missing - table = fix_type_symbol(table) + if fix_type_symbol: + table = pdb_fix_type_symbol(table) # table modular extensions if add_contacts: @@ -541,7 +548,7 @@ def add_mmcif_res_full(data): return table -def fix_pdb_ins_code(data): +def pdb_fix_pdb_ins_code(data): """ Utility that fixes the 'pdbx_PDB_ins_code' column to match is expected in the mmcif format. @@ -551,18 +558,12 @@ def fix_pdb_ins_code(data): """ table = data - ins_codes = [] - for i in table.index: - value = table.loc[i, "pdbx_PDB_ins_code"] - if value == '' or value == ' ' or value == '?': - value = '?' 
- ins_codes.append(value) - table["pdbx_PDB_ins_code"] = ins_codes + table['pdbx_PDB_ins_code'] = table['pdbx_PDB_ins_code'].replace(['', ' '], '?') table['pdbx_PDB_ins_code'] = table['pdbx_PDB_ins_code'].fillna('?').astype(str) return table -def fix_label_alt_id(data): +def pdb_fix_label_alt_id(data): """ Utility that fixes the 'label_alt_id' column to match what is expected in the mmCIF format. @@ -572,18 +573,12 @@ def fix_label_alt_id(data): """ table = data - alt_locs = [] - for i in table.index: - value = table.loc[i, "label_alt_id"] - if value == '' or value == ' ' or value == '?': - value = '.' - alt_locs.append(value) - table["label_alt_id"] = alt_locs + table['label_alt_id'] = table['label_alt_id'].replace(['', ' ', '?'], '.') table['label_alt_id'] = table['label_alt_id'].fillna('.').astype(str) return table -def fix_type_symbol(data): +def pdb_fix_type_symbol(data): """ Utility that fixes the 'type_symbol' column to match what is expected in the mmCIF format - when missing in the Structure. 
diff --git a/tests/test_pdbx.py b/tests/test_pdbx.py index 9509eda..b3d5f5b 100644 --- a/tests/test_pdbx.py +++ b/tests/test_pdbx.py @@ -27,7 +27,7 @@ add_mmcif_atom_altloc, residues_aggregation, remove_multiple_altlocs, add_mmcif_new_pro_ids, remove_partial_residues, get_coordinates, get_sequence, - fix_label_alt_id, fix_pdb_ins_code, fix_type_symbol, + pdb_fix_label_alt_id, pdb_fix_pdb_ins_code, pdb_fix_type_symbol, get_real_residue_ranges) from prointvar.config import config as c @@ -72,9 +72,9 @@ def setUp(self): self.remove_altloc = remove_multiple_altlocs self.add_mmcif_new_pro_ids = add_mmcif_new_pro_ids self.remove_partial_residues = remove_partial_residues - self.fix_label_alt_id = fix_label_alt_id - self.fix_pdb_ins_code = fix_pdb_ins_code - self.fix_type_symbol = fix_type_symbol + self.fix_label_alt_id = pdb_fix_label_alt_id + self.fix_pdb_ins_code = pdb_fix_pdb_ins_code + self.fix_type_symbol = pdb_fix_type_symbol self.get_coordinates = get_coordinates self.get_sequence = get_sequence self.get_real_residue_ranges = get_real_residue_ranges @@ -569,23 +569,27 @@ def test_remove_partial_res(self): self.assertEqual(8, list(data.label_seq_id.tolist()).count('25')) def test_fix_label_alt_id(self): - reader = self.reader(self.inputpdb2) - data = reader.atoms(format_type='pdb') + data = self.parser_pdb(self.inputpdb2, fix_label_alt_id=False) + self.assertEqual(1, data.loc[0, 'id']) + self.assertEqual('N', data.loc[0, 'label_atom_id']) + self.assertEqual('A', data.loc[0, 'label_alt_id']) data = self.fix_label_alt_id(data) self.assertEqual(1, data.loc[0, 'id']) self.assertEqual('N', data.loc[0, 'label_atom_id']) self.assertEqual('A', data.loc[0, 'label_alt_id']) def test_fix_pdb_ins_code(self): - reader = self.reader(self.inputpdb2) - data = reader.atoms(format_type='pdb') + data = self.parser_pdb(self.inputpdb2, fix_ins_code=False) + self.assertEqual(1, data.loc[0, 'id']) + self.assertEqual('', data.loc[0, 'pdbx_PDB_ins_code']) data = self.fix_pdb_ins_code(data) 
self.assertEqual(1, data.loc[0, 'id']) self.assertEqual('?', data.loc[0, 'pdbx_PDB_ins_code']) def test_fix_type_symbol(self): - reader = self.reader(self.inputpdb2) - data = reader.atoms(format_type='pdb') + data = self.parser_pdb(self.inputpdb2, fix_type_symbol=False) + self.assertEqual(1, data.loc[0, 'id']) + self.assertEqual('', data.loc[0, 'type_symbol']) data = self.fix_type_symbol(data) self.assertEqual(1, data.loc[0, 'id']) self.assertEqual('N', data.loc[0, 'type_symbol']) diff --git a/tests/testdata/pdbx/1ejg.pdb b/tests/testdata/pdbx/1ejg.pdb index 96e2a12..1002d5b 100644 --- a/tests/testdata/pdbx/1ejg.pdb +++ b/tests/testdata/pdbx/1ejg.pdb @@ -273,25 +273,25 @@ ORIGX3 0.000000 0.000000 1.000000 0.00000 SCALE1 0.024495 0.000000 0.000201 0.00000 SCALE2 0.000000 0.054060 0.000000 0.00000 SCALE3 0.000000 0.000000 0.044702 0.00000 -ATOM 1 N ATHR A 1 16.885 14.078 3.427 0.82 4.48 N -ANISOU 1 N ATHR A 1 434 531 735 201 133 -28 N -ATOM 2 N BTHR A 1 17.553 14.234 4.214 0.18 5.51 N -ATOM 3 CA ATHR A 1 16.938 12.834 4.234 0.82 3.12 C -ANISOU 3 CA ATHR A 1 305 437 441 39 103 -65 C -ATOM 4 CA BTHR A 1 16.985 12.901 4.228 0.18 16.71 C -ATOM 5 C THR A 1 15.642 12.760 4.985 1.00 3.28 C -ANISOU 5 C THR A 1 338 431 477 -32 137 -43 C -ATOM 6 O THR A 1 15.150 13.818 5.439 1.00 5.91 O -ANISOU 6 O THR A 1 476 455 1311 -213 289 -42 O -ATOM 7 CB ATHR A 1 18.103 12.875 5.202 0.82 4.05 C -ANISOU 7 CB ATHR A 1 249 662 627 23 60 -110 C -ATOM 8 CB BTHR A 1 17.983 11.950 4.917 0.18 8.95 C -ATOM 9 OG1ATHR A 1 19.256 13.004 4.401 0.82 5.75 O -ANISOU 9 OG1ATHR A 1 294 1049 842 134 158 -125 O -ATOM 10 OG1BTHR A 1 17.792 10.543 4.939 0.18 9.94 O -ATOM 11 CG2ATHR A 1 18.173 11.582 6.055 0.82 5.30 C -ANISOU 11 CG2ATHR A 1 343 850 818 215 -11 -32 C -ATOM 12 CG2BTHR A 1 18.117 12.174 6.499 0.18 6.78 C +ATOM 1 N ATHR A 1 16.885 14.078 3.427 0.82 4.48 +ANISOU 1 N ATHR A 1 434 531 735 201 133 -28 +ATOM 2 N BTHR A 1 17.553 14.234 4.214 0.18 5.51 +ATOM 3 CA ATHR A 1 16.938 12.834 4.234 0.82 3.12 
+ANISOU 3 CA ATHR A 1 305 437 441 39 103 -65 +ATOM 4 CA BTHR A 1 16.985 12.901 4.228 0.18 16.71 +ATOM 5 C THR A 1 15.642 12.760 4.985 1.00 3.28 +ANISOU 5 C THR A 1 338 431 477 -32 137 -43 +ATOM 6 O THR A 1 15.150 13.818 5.439 1.00 5.91 +ANISOU 6 O THR A 1 476 455 1311 -213 289 -42 +ATOM 7 CB ATHR A 1 18.103 12.875 5.202 0.82 4.05 +ANISOU 7 CB ATHR A 1 249 662 627 23 60 -110 +ATOM 8 CB BTHR A 1 17.983 11.950 4.917 0.18 8.95 +ATOM 9 OG1ATHR A 1 19.256 13.004 4.401 0.82 5.75 +ANISOU 9 OG1ATHR A 1 294 1049 842 134 158 -125 +ATOM 10 OG1BTHR A 1 17.792 10.543 4.939 0.18 9.94 +ATOM 11 CG2ATHR A 1 18.173 11.582 6.055 0.82 5.30 +ANISOU 11 CG2ATHR A 1 343 850 818 215 -11 -32 +ATOM 12 CG2BTHR A 1 18.117 12.174 6.499 0.18 6.78 ATOM 13 H1 ATHR A 1 17.705 14.139 2.840 0.82 6.68 H ATOM 14 H1 BTHR A 1 18.510 14.188 3.898 0.18 7.89 H ATOM 15 H2 ATHR A 1 16.879 14.894 4.049 0.82 6.68 H