Skip to content

Commit

Permalink
Merge pull request #333 from MannLabs/simplify-dataset-init
Browse files Browse the repository at this point in the history
simplify-dataset-init
  • Loading branch information
mschwoer authored Sep 20, 2024
2 parents 963a91f + 02b4da2 commit 629b03c
Show file tree
Hide file tree
Showing 6 changed files with 221 additions and 166 deletions.
231 changes: 76 additions & 155 deletions alphastats/DataSet.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Union, Dict, Optional
from typing import List, Union, Dict, Optional, Tuple

import pandas as pd
import numpy as np
Expand All @@ -20,6 +20,7 @@
from alphastats.plots.IntensityPlot import IntensityPlot
from alphastats.plots.SampleHistogram import SampleHistogram
from alphastats.plots.VolcanoPlot import VolcanoPlot
from alphastats.dataset_factory import DataSetFactory

plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
layout=plotly.graph_objects.Layout(
Expand Down Expand Up @@ -48,14 +49,14 @@ class DataSet:
def __init__(
self,
loader: BaseLoader,
metadata_path: Optional[str] = None,
metadata_path: Optional[Union[str, pd.DataFrame]] = None,
sample_column: Optional[str] = None,
):
"""Create DataSet
Args:
loader (_type_): loader of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader
metadata_path (str, optional): path to metadata file. Defaults to None.
metadata_path (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None.
sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.
Attributes of a DataSet instance:
Expand All @@ -66,47 +67,79 @@ def __init__(
"""
self._check_loader(loader=loader)

# fill data from loader
self.rawinput: pd.DataFrame = loader.rawinput
self.software: str = loader.software
self.index_column: str = loader.index_column
self.intensity_column: Union[str, list] = loader.intensity_column
self.filter_columns: List[str] = loader.filter_columns
self.evidence_df: pd.DataFrame = loader.evidence_df
self.gene_names: str = loader.gene_names

# include filtering before
self._create_matrix()
self._check_matrix_values()

self.metadata: pd.DataFrame
self.sample: str
if metadata_path is not None:
self.sample = sample_column
self.metadata = self._load_metadata(file_path=metadata_path)
self._remove_misc_samples_in_metadata()
else:
self.sample = "sample"
self.metadata = pd.DataFrame({"sample": list(self.mat.index)})

if loader == "Generic":
intensity_column = loader._extract_sample_names(
self.index_column: str = loader.index_column
self.software: str = loader.software
self._gene_names: str = loader.gene_names

self._intensity_column: Union[str, list] = (
loader._extract_sample_names(
metadata=self.metadata, sample_column=self.sample
)
self.intensity_column = intensity_column
if loader == "Generic"
else loader.intensity_column
)

# init preprocessing settings
self.preprocessing_info: Dict = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
intensity_column=self.intensity_column,
filter_columns=self.filter_columns,
# self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused

self._dataset_factory = DataSetFactory(
rawinput=self.rawinput,
index_column=self.index_column,
intensity_column=self._intensity_column,
metadata_path=metadata_path,
sample_column=sample_column,
)

self.preprocessed = False
self.preprocessed: bool = False
rawmat, mat, metadata, sample, preprocessing_info = self._get_init_dataset()
self.rawmat: pd.DataFrame = rawmat
self.mat: pd.DataFrame = mat
self.metadata: pd.DataFrame = metadata
self.sample: str = sample
self.preprocessing_info: Dict = preprocessing_info

print("DataSet has been created.")

def _get_init_dataset(
    self,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict]:
    """Build the initial data structures for the DataSet.

    Returns:
        Tuple of (raw matrix, matrix, metadata, sample column name,
        preprocessing-info dict), all derived from the dataset factory.
    """
    raw_matrix, matrix = self._dataset_factory.create_matrix_from_rawinput()
    meta_df, sample_col = self._dataset_factory.create_metadata(matrix)

    # matrix is samples x protein groups
    n_samples, n_protein_groups = matrix.shape
    prep_info = Preprocess.init_preprocessing_info(
        num_samples=n_samples,
        num_protein_groups=n_protein_groups,
        intensity_column=self._intensity_column,
        filter_columns=self.filter_columns,
    )

    return raw_matrix, matrix, meta_df, sample_col, prep_info

def _check_loader(self, loader):
    """Validate that *loader* is a usable loader instance.

    Args:
        loader : loader (AlphaPeptLoader, MaxQuantLoader, DIANNLoader,
            FragPipeLoader, SpectronautLoader)
    """
    # Must be one of the known loader subclasses.
    if not isinstance(loader, BaseLoader):
        raise LoaderError(
            f"loader must be a subclass of BaseLoader, got {loader.__class__.__name__}"
        )

    rawinput = loader.rawinput
    # A missing or empty DataFrame indicates the load itself failed.
    if not isinstance(rawinput, pd.DataFrame) or rawinput.empty:
        raise ValueError(
            "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

    # The feature/protein identifier column must be a single column name.
    if not isinstance(loader.index_column, str):
        raise ValueError(
            "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

def _get_preprocess(self) -> Preprocess:
"""Return instance of the Preprocess object."""
return Preprocess(
Expand Down Expand Up @@ -143,21 +176,16 @@ def preprocess(
**kwargs,
)
)
self.preprocessed = True

def reset_preprocessing(self):
"""Reset all preprocessing steps"""
self._create_matrix()
self.preprocessing_info = Preprocess.init_preprocessing_info(
num_samples=self.mat.shape[0],
num_protein_groups=self.mat.shape[1],
intensity_column=self.intensity_column,
filter_columns=self.filter_columns,
)

self.preprocessed = False
# TODO fix bug: metadata is not reset/reloaded here
print("All preprocessing steps are reset.")
(
self.rawmat,
self.mat,
self.metadata,
self.sample,
self.preprocessing_info,
) = self._get_init_dataset()

def batch_correction(self, batch: str) -> None:
"""A wrapper for Preprocess.batch_correction(), see documentation there."""
Expand Down Expand Up @@ -340,7 +368,7 @@ def plot_volcano(
metadata=self.metadata,
sample=self.sample,
index_column=self.index_column,
gene_names=self.gene_names,
gene_names=self._gene_names,
preprocessing_info=self.preprocessing_info,
group1=group1,
group2=group2,
Expand Down Expand Up @@ -392,7 +420,7 @@ def plot_intensity(
mat=self.mat,
metadata=self.metadata,
sample=self.sample,
intensity_column=self.intensity_column,
intensity_column=self._intensity_column,
preprocessing_info=self.preprocessing_info,
protein_id=protein_id,
group=group,
Expand Down Expand Up @@ -477,110 +505,3 @@ def plot_dendrogram(
):
"""A wrapper for Plot.plot_dendrogram(), see documentation there."""
return self._get_plot().plot_dendrogram(linkagefun)

def _check_loader(self, loader):
    """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
    Args:
        loader : loader
    """
    # Reject anything that is not a known loader subclass early.
    if not isinstance(loader, BaseLoader):
        raise LoaderError(
            "loader must be a subclass of BaseLoader, "
            f"got {loader.__class__.__name__}"
        )

    # rawinput must be a non-empty DataFrame; an empty frame means the load failed.
    if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty:
        raise ValueError(
            "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

    # The feature/protein identifier column must be a single column name (str).
    if not isinstance(loader.index_column, str):
        raise ValueError(
            "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
        )

def _check_matrix_values(self):
if np.isinf(self.mat).values.sum() > 0:
logging.warning("Data contains infinite values.")

def _remove_misc_samples_in_metadata(self):
samples_matrix = self.mat.index.to_list()
samples_metadata = self.metadata[self.sample].to_list()
misc_samples = list(set(samples_metadata) - set(samples_matrix))
if len(misc_samples) > 0:
self.metadata = self.metadata[
~self.metadata[self.sample].isin(misc_samples)
]
logging.warning(
f"{misc_samples} are not described in the protein data and"
"are removed from the metadata."
)

def _create_matrix(self):
    """
    Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
    rows the samples.

    Sets ``self.rawmat`` (transposed intensity table with +/-inf replaced by
    NaN) and ``self.mat`` (rawmat with all-zero feature columns dropped,
    cast to float).
    """

    df = self.rawinput
    df = df.set_index(self.index_column)

    if isinstance(self.intensity_column, str):
        # intensity_column is a template such as "Intensity [sample]";
        # replacing the placeholder with ".*" yields a regex matching all
        # per-sample intensity columns.
        regex_find_intensity_columns = self.intensity_column.replace(
            "[sample]", ".*"
        )
        df = df.filter(regex=(regex_find_intensity_columns), axis=1)
        # remove Intensity so only sample names remain
        substring_to_remove = regex_find_intensity_columns.replace(".*", "")
        # NOTE(review): the removed substring is passed to Series.str.replace;
        # if the template contains regex metacharacters the behavior depends
        # on pandas' regex default — confirm against the loader templates.
        df.columns = df.columns.str.replace(substring_to_remove, "")

    else:
        # intensity_column is an explicit list of column names
        df = df[self.intensity_column]

    # transpose dataframe
    mat = df.transpose()
    mat.replace([np.inf, -np.inf], np.nan, inplace=True)
    self.rawmat = mat

    # remove proteins with only zero # TODO this is re-done in preprocessing
    mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)]
    self.mat = mat_no_zeros.astype(float)

def _load_metadata(
self, file_path: Union[pd.DataFrame, str]
) -> Optional[pd.DataFrame]:
"""Load metadata either xlsx, txt, csv or txt file
Args:
file_path: path to metadata file or metadata DataFrame # TODO disentangle this
"""
if isinstance(file_path, pd.DataFrame):
df = file_path
elif file_path.endswith(".xlsx"):
warnings.filterwarnings(
"ignore",
category=UserWarning,
module="openpyxl",
# message=r"/extension is not supported and will be removed/", # this somehow does not work here?
)
df = pd.read_excel(file_path)
# find robust way to detect file format
# else give file separation as variable
elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
df = pd.read_csv(file_path, delimiter="\t")
elif file_path.endswith(".csv"):
df = pd.read_csv(file_path)
else:
logging.warning(
"WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
)
return None

if df is not None and self.sample not in df.columns:
logging.error(f"sample_column: {self.sample} not found in {file_path}")

# check whether sample labeling matches protein data
# warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
df.columns = df.columns.astype(str)
return df
6 changes: 6 additions & 0 deletions alphastats/DataSet_Preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
class PreprocessingStateKeys:
"""Keys for accessing the dictionary holding the information about preprocessing."""

# TODO disentangle these keys from the human-readably display strings
PREPROCESSING_DONE = "Preprocessing done"

RAW_DATA_NUM_PG = "Raw data number of Protein Groups"
NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups"
NUM_SAMPLES = "Matrix= Number of samples"
Expand Down Expand Up @@ -68,6 +71,7 @@ def init_preprocessing_info(
) -> Dict:
"""Initialize preprocessing info."""
return {
PreprocessingStateKeys.PREPROCESSING_DONE: False,
PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups,
PreprocessingStateKeys.NUM_PG: num_protein_groups,
PreprocessingStateKeys.NUM_SAMPLES: num_samples,
Expand Down Expand Up @@ -111,6 +115,7 @@ def _remove_na_values(self, cut_off):
"Missing values have already been filtered. To apply another cutoff, reset preprocessing."
)
return

cut = 1 - cut_off

num_samples, num_proteins = self.mat.shape
Expand Down Expand Up @@ -450,6 +455,7 @@ def preprocess(

self.preprocessing_info.update(
{
PreprocessingStateKeys.PREPROCESSING_DONE: True,
PreprocessingStateKeys.NUM_PG: self.mat.shape[1],
}
)
Expand Down
Loading

0 comments on commit 629b03c

Please sign in to comment.