Skip to content

Commit

Permalink
Merge pull request #117 from MannLabs/development
Browse files Browse the repository at this point in the history
Release v1.1.1
  • Loading branch information
GeorgWa authored Nov 11, 2023
2 parents bbaecc3 + 24834f6 commit 105ec31
Show file tree
Hide file tree
Showing 60 changed files with 1,598 additions and 1,401 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.1.0
current_version = 1.1.1
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down
6 changes: 4 additions & 2 deletions .github/workflows/pip_installation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,12 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
os: [ubuntu-latest, macOS-latest, windows-latest, macos-latest-xlarge]
steps:
- uses: actions/checkout@v3
- uses: conda-incubator/setup-miniconda@v2
with:
miniconda-version: "latest"
auto-update-conda: true
python-version: ${{ matrix.python-version }}
- name: Conda info
Expand All @@ -39,11 +40,12 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macOS-latest, windows-latest]
os: [ubuntu-latest, macOS-latest, windows-latest, macos-latest-xlarge]
steps:
- uses: actions/checkout@v3
- uses: conda-incubator/setup-miniconda@v2
with:
miniconda-version: "latest"
auto-update-conda: true
python-version: ${{ matrix.python-version }}
- name: Conda info
Expand Down
2 changes: 1 addition & 1 deletion alphabase/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


__project__ = "alphabase"
__version__ = "1.1.0"
__version__ = "1.1.1"
__license__ = "Apache"
__description__ = "An infrastructure Python package of the AlphaX ecosystem"
__author__ = "Mann Labs"
Expand Down
8 changes: 8 additions & 0 deletions alphabase/constants/const_files/psm_reader.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,20 @@ maxquant:
'intensity': 'Intensity'

modification_mapping:
'Dimethyl@K':
- 'K(Dimethyl)'
'Dimethyl@R':
- 'R(Dimethyl)'
'Dimethyl@Any N-term':
- '(Dimethyl)'
'Acetyl@Protein N-term':
- '_(Acetyl (Protein N-term))'
- '_(ac)'
'Carbamidomethyl@C':
- 'C(Carbamidomethyl (C))'
- 'C(Carbamidomethyl)'
'Oxidation@M':
- 'M(Oxidation)'
- 'M(Oxidation (M))'
- 'M(ox)'
'Phospho@S':
Expand Down
2 changes: 1 addition & 1 deletion alphabase/constants/modification.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def load_mod_df(
modloss_importance_level=1,
):
global MOD_DF
MOD_DF = pd.read_table(tsv)
MOD_DF = pd.read_table(tsv,keep_default_na=False)
_df = MOD_DF[MOD_DF.mod_name.str.contains(' ', regex=False)].copy()
_df["mod_name"] = MOD_DF.mod_name.str.replace(' ', '_', regex=False)
MOD_DF = pd.concat(
Expand Down
4 changes: 2 additions & 2 deletions alphabase/peptide/fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,12 +295,12 @@ def update_sliced_fragment_dataframe(
frag_slice_list = [slice(start,end) for start,end in frag_start_end_list]
frag_slices = np.r_[tuple(frag_slice_list)]
if charged_frag_types is None or len(charged_frag_types)==0:
fragment_df.values[frag_slices, :] = values.astype(fragment_df.dtypes[0])
fragment_df.values[frag_slices, :] = values.astype(fragment_df.dtypes.iloc[0])
else:
charged_frag_idxes = [fragment_df.columns.get_loc(c) for c in charged_frag_types]
fragment_df.iloc[
frag_slices, charged_frag_idxes
] = values.astype(fragment_df.dtypes[0])
] = values.astype(fragment_df.dtypes.iloc[0])
return fragment_df

def get_sliced_fragment_dataframe(
Expand Down
6 changes: 6 additions & 0 deletions alphabase/peptide/mobility.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,12 @@ def mobility_to_ccs_for_df(

if 'precursor_mz' not in precursor_df.columns:
precursor_df = update_precursor_mz(precursor_df)
if precursor_df[mobility_column].isna().any():
print(f"NA/nan is detected in the `{mobility_column}` column, fillna with 0.0")
precursor_df[mobility_column] = precursor_df[mobility_column].fillna(0.0).astype(np.float64)
if (precursor_df[mobility_column]=="").any():
print(f"Empty string is detected in the `{mobility_column}` column, fill with 0.0")
precursor_df[mobility_column] = precursor_df[mobility_column].replace("",0.0).astype(np.float64)
return mobility_to_ccs_bruker(
precursor_df[mobility_column].values,
precursor_df.charge.values,
Expand Down
82 changes: 82 additions & 0 deletions alphabase/protein/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@
import os
import itertools
import copy
import ahocorasick
from tqdm import tqdm

import warnings

from Bio import SeqIO
from typing import Union
Expand Down Expand Up @@ -46,13 +50,16 @@ def read_fasta_file(fasta_filename:str=""):
parts = record.id.split("|") # pipe char
if len(parts) > 1:
id = parts[1]
gene_org = parts[2]
else:
id = record.name
gene_org = record.name
sequence = str(record.seq)
entry = {
"protein_id": id,
"full_name": record.name,
"gene_name": get_uniprot_gene_name(record.description),
"gene_org": gene_org,
"description": record.description,
"sequence": sequence,
}
Expand Down Expand Up @@ -1310,3 +1317,78 @@ def load_hdf(self, hdf_file:str, load_mod_seq:bool=False):
self.protein_df = _hdf.library.protein_df.values
except (AttributeError, KeyError, ValueError, TypeError):
print(f"No protein_df in {hdf_file}")

def annotate_precursor_df(
    precursor_df : pd.DataFrame,
    protein_df : pd.DataFrame,
):
    """Annotate peptides with the genes and proteins whose sequence contains them.

    The unique peptide sequences of `precursor_df` are added to an
    ahocorasick automaton, and every protein sequence in `protein_df`
    is scanned once against it.

    Parameters
    ----------
    precursor_df : pd.DataFrame
        A dataframe containing a `sequence` column. It is not modified;
        existing `genes`, `proteins`, `proteotypic` and `cardinality`
        columns are left out of the returned dataframe and replaced
        by the new annotation.

    protein_df : pd.DataFrame
        Protein dataframe; the `sequence`, `gene_org` and `protein_id`
        values of its rows are read.

    Returns
    -------
    pd.DataFrame
        Left merge of `precursor_df` with the per-sequence annotation
        columns `genes` and `proteins` (values joined by ';') and
        `cardinality` (number of matching protein entries).
        If either input is empty, `precursor_df` itself is returned unchanged.

    Raises
    ------
    SystemError
        If both inputs are non-empty and `precursor_df`
        has no `sequence` column.
    """
    if len(precursor_df) == 0 or len(protein_df) == 0:
        return precursor_df

    if 'sequence' not in precursor_df.columns:
        raise SystemError('precursor_df must contain a sequence column')

    peptide_df = pd.DataFrame({
        'sequence': precursor_df['sequence'].unique()
    })

    # the automaton stores each unique peptide together with its row index in peptide_df
    automaton = ahocorasick.Automaton()
    for i, peptide_sequence in enumerate(peptide_df['sequence']):
        automaton.add_word(peptide_sequence, i)
    automaton.make_automaton()

    genes = [[] for _ in range(len(peptide_df))]
    proteins = [[] for _ in range(len(peptide_df))]

    for protein_entry in tqdm(protein_df.to_dict('records')):
        # a peptide can occur several times within one protein;
        # the set makes sure each protein is recorded once per peptide
        matched_idxes = {
            idx for _, idx in automaton.iter(protein_entry['sequence'])
        }
        for i in matched_idxes:
            genes[i].append(protein_entry['gene_org'])
            proteins[i].append(protein_entry['protein_id'])

    peptide_df['genes'] = [';'.join(g) for g in genes]
    peptide_df['proteins'] = [';'.join(p) for p in proteins]
    peptide_df['cardinality'] = [len(g) for g in genes]

    failed_annotation = np.sum(peptide_df['genes'] == '')
    if failed_annotation > 0:
        warnings.warn(f'{failed_annotation} peptides could not be annotated')

    # Drop stale annotation columns on a copy: the caller's dataframe must not
    # be altered as a side effect, and the merge would otherwise produce
    # suffixed duplicate columns.
    cleaned_df = precursor_df.drop(
        columns=['genes', 'proteins', 'proteotypic', 'cardinality'],
        errors='ignore',
    )

    return cleaned_df.merge(peptide_df, on='sequence', how='left')
6 changes: 3 additions & 3 deletions alphabase/psm_reader/dia_psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ def _init_column_mapping(self):

def _load_file(self, filename):
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep)
df = pd.read_csv(filename, sep=self.csv_sep,keep_default_na=False)
self._find_mod_seq_column(df)
if 'ReferenceRun' in df.columns:
df.drop_duplicates([
Expand Down Expand Up @@ -126,7 +126,7 @@ def _init_column_mapping(self):

def _load_file(self, filename):
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep)
df = pd.read_csv(filename, sep=self.csv_sep,keep_default_na=False)

return df

Expand Down Expand Up @@ -188,7 +188,7 @@ def _init_column_mapping(self):
def _load_file(self, filename):
self.mod_seq_column = 'ModifiedSequence'
self.csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=self.csv_sep)
df = pd.read_csv(filename, sep=self.csv_sep,keep_default_na=False)
df[[self.mod_seq_column,'charge']] = df[
self.precursor_column
].str.split('.', expand=True, n=2)
Expand Down
36 changes: 27 additions & 9 deletions alphabase/psm_reader/maxquant_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,19 +193,37 @@ def _add_all_unimod(self):
self.modification_mapping[mod_name] = [unimod]

def _extend_mod_brackets(self):
"""update modification_mapping to include different bracket types.
"""

for key, mod_list in list(self.modification_mapping.items()):
extend_mods = []

mod_set = set(mod_list)
# extend bracket types of modifications
# K(Acetyl) -> K[Acetyl]
# (Phospho) -> _(Phospho)
# _[Phospho] -> _(Phospho)
for mod in mod_list:

if mod[1] == '(':
extend_mods.append(f'{mod[0]}[{mod[2:-1]}]')
mod_set.add(f'{mod[0]}[{mod[2:-1]}]')
elif mod[1] == '[':
extend_mods.append(f'{mod[0]}({mod[2:-1]})')
mod_set.add(f'{mod[0]}({mod[2:-1]})')

if mod.startswith('_'):
mod_set.add(f'{mod[1:]}')
elif mod.startswith('('):
mod_set.add(f'_{mod}')
mod_set.add(f'[{mod[1:-1]}]')
mod_set.add(f'_[{mod[1:-1]}]')
elif mod.startswith('['):
mod_set.add(f'_{mod}')
mod_set.add(f'({mod[1:-1]})')
mod_set.add(f'_({mod[1:-1]})')

self.modification_mapping[key] = list(mod_set)

self.modification_mapping[key].extend(extend_mods)

self.modification_mapping[key].extend(
[f'{mod[1:]}' for mod in mod_list if mod.startswith('_')]
)

def _translate_decoy(self, origin_df=None):
if 'decoy' in self._psm_df.columns:
Expand All @@ -220,7 +238,7 @@ def _init_column_mapping(self):

def _load_file(self, filename):
csv_sep = self._get_table_delimiter(filename)
df = pd.read_csv(filename, sep=csv_sep)
df = pd.read_csv(filename, sep=csv_sep,keep_default_na=False)
self._find_mod_seq_column(df)
df = df[~pd.isna(df['Retention time'])]
df.fillna('', inplace=True)
Expand Down
2 changes: 1 addition & 1 deletion alphabase/psm_reader/pfind_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ def _translate_modifications(self):
pass

def _load_file(self, filename):
pfind_df = pd.read_csv(filename, index_col=False, sep='\t')
pfind_df = pd.read_csv(filename, index_col=False, sep='\t',keep_default_na=False)
pfind_df.fillna('', inplace=True)
pfind_df = pfind_df[pfind_df.Sequence != '']
pfind_df['raw_name'] = pfind_df[
Expand Down
33 changes: 27 additions & 6 deletions alphabase/psm_reader/psm_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import io
import pandas as pd
import numpy as np
import warnings

import alphabase.peptide.mobility as mobility
from alphabase.peptide.precursor import (
Expand All @@ -13,6 +14,8 @@
from alphabase.utils import get_delimiter
from alphabase.yaml_utils import load_yaml



def translate_other_modification(
mod_str: str,
mod_dict: dict
Expand All @@ -36,14 +39,20 @@ def translate_other_modification(
new mods in AlphaBase format separated by ';'. if any
modification is not in `mod_dict`, return pd.NA.
'''
if not mod_str: return ""

if mod_str == "" : return "", []
ret_mods = []
unknown_mods = []
for mod in mod_str.split(';'):
if mod in mod_dict:
ret_mods.append(mod_dict[mod])
else:
return pd.NA
return ";".join(ret_mods)
unknown_mods.append(mod)

if len(unknown_mods) > 0:
return pd.NA, unknown_mods
else:
return ";".join(ret_mods), []

def keep_modifications(
mod_str: str,
Expand Down Expand Up @@ -425,7 +434,6 @@ def _translate_columns(self, origin_df:pd.DataFrame):
not 'spec_idx' in self._psm_df.columns
):
self._psm_df['spec_idx'] = self._psm_df.scan_num - 1
self._psm_df.fillna("",inplace=True)

def _transform_table(self, origin_df:pd.DataFrame):
"""
Expand Down Expand Up @@ -468,10 +476,23 @@ def _translate_modifications(self):
if `mod` in `mod_names` is
not in `self.modification_mapping`
'''
self._psm_df.mods = self._psm_df.mods.apply(

self._psm_df.mods, unknown_mods = zip(*self._psm_df.mods.apply(
translate_other_modification,
mod_dict=self.rev_mod_mapping
)
))

# accumulate unknown mods
unknwon_mod_set = set()
for mod_list in unknown_mods:
if len(mod_list) > 0:
unknwon_mod_set.update(mod_list)

if len(unknwon_mod_set) > 0:
warnings.warn(
f'Unknown modifications: {unknwon_mod_set}. Precursors with unknown modifications will be removed.'
)


def _post_process(self,
origin_df:pd.DataFrame
Expand Down
2 changes: 1 addition & 1 deletion alphabase/spectral_library/decoy.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ def get_decoy_lib(self, name:str,
target_lib, **kwargs
)
else:
return None
raise ValueError(f'Decoy method {name} not found.')

decoy_lib_provider:SpecLibDecoyProvider = SpecLibDecoyProvider()
"""
Expand Down
Loading

0 comments on commit 105ec31

Please sign in to comment.