diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py
index b24b93fa..51535856 100644
--- a/alphastats/DataSet.py
+++ b/alphastats/DataSet.py
@@ -1,4 +1,4 @@
-from typing import List, Union, Dict, Optional
+from typing import List, Union, Dict, Optional, Tuple
 
 import pandas as pd
 import numpy as np
@@ -20,6 +20,7 @@
 from alphastats.plots.IntensityPlot import IntensityPlot
 from alphastats.plots.SampleHistogram import SampleHistogram
 from alphastats.plots.VolcanoPlot import VolcanoPlot
+from alphastats.dataset_factory import DataSetFactory
 
 plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
     layout=plotly.graph_objects.Layout(
@@ -48,14 +49,14 @@ class DataSet:
     def __init__(
         self,
         loader: BaseLoader,
-        metadata_path: Optional[str] = None,
+        metadata_path: Optional[Union[str, pd.DataFrame]] = None,
         sample_column: Optional[str] = None,
     ):
         """Create DataSet
 
         Args:
             loader (_type_): loader of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader
-            metadata_path (str, optional): path to metadata file. Defaults to None.
+            metadata_path (str or pd.DataFrame, optional): path to a metadata file, or a metadata DataFrame. Defaults to None.
             sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None.
 
         Attributes of a DataSet instance:
@@ -66,47 +67,79 @@ def __init__(
         """
         self._check_loader(loader=loader)
 
+        # fill data from loader
         self.rawinput: pd.DataFrame = loader.rawinput
-        self.software: str = loader.software
-        self.index_column: str = loader.index_column
-        self.intensity_column: Union[str, list] = loader.intensity_column
         self.filter_columns: List[str] = loader.filter_columns
-        self.evidence_df: pd.DataFrame = loader.evidence_df
-        self.gene_names: str = loader.gene_names
-
-        # include filtering before
-        self._create_matrix()
-        self._check_matrix_values()
-
-        self.metadata: pd.DataFrame
-        self.sample: str
-        if metadata_path is not None:
-            self.sample = sample_column
-            self.metadata = self._load_metadata(file_path=metadata_path)
-            self._remove_misc_samples_in_metadata()
-        else:
-            self.sample = "sample"
-            self.metadata = pd.DataFrame({"sample": list(self.mat.index)})
-
-        if loader == "Generic":
-            intensity_column = loader._extract_sample_names(
+        self.index_column: str = loader.index_column
+        self.software: str = loader.software
+        self._gene_names: str = loader.gene_names
+
+        self._intensity_column: Union[str, list] = (
+            loader._extract_sample_names(
                 metadata=self.metadata, sample_column=self.sample
             )
-            self.intensity_column = intensity_column
+            if loader == "Generic"
+            else loader.intensity_column
+        )
 
-        # init preprocessing settings
-        self.preprocessing_info: Dict = Preprocess.init_preprocessing_info(
-            num_samples=self.mat.shape[0],
-            num_protein_groups=self.mat.shape[1],
-            intensity_column=self.intensity_column,
-            filter_columns=self.filter_columns,
+        # self.evidence_df: pd.DataFrame = loader.evidence_df  # TODO unused
+
+        self._dataset_factory = DataSetFactory(
+            rawinput=self.rawinput,
+            index_column=self.index_column,
+            intensity_column=self._intensity_column,
+            metadata_path=metadata_path,
+            sample_column=sample_column,
         )
-        self.preprocessed = False
-        self.preprocessed: bool = False
+        rawmat, mat, metadata, sample, preprocessing_info = self._get_init_dataset()
+        self.rawmat: pd.DataFrame = rawmat
+        self.mat: pd.DataFrame = mat
+        self.metadata: pd.DataFrame = metadata
+        self.sample: str = sample
+        self.preprocessing_info: Dict = preprocessing_info
 
         print("DataSet has been created.")
 
+    def _get_init_dataset(
+        self,
+    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict]:
+        """Get the initial data structures for the DataSet."""
+        rawmat, mat = self._dataset_factory.create_matrix_from_rawinput()
+
+        metadata, sample = self._dataset_factory.create_metadata(mat)
+
+        preprocessing_info = Preprocess.init_preprocessing_info(
+            num_samples=mat.shape[0],
+            num_protein_groups=mat.shape[1],
+            intensity_column=self._intensity_column,
+            filter_columns=self.filter_columns,
+        )
+
+        return rawmat, mat, metadata, sample, preprocessing_info
+
+    def _check_loader(self, loader):
+        """Checks if the loader is of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader or SpectronautLoader
+
+        Args:
+            loader : loader
+        """
+        if not isinstance(loader, BaseLoader):
+            raise LoaderError(
+                "loader must be a subclass of BaseLoader, "
+                f"got {loader.__class__.__name__}"
+            )
+
+        if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty:
+            raise ValueError(
+                "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
+            )
+
+        if not isinstance(loader.index_column, str):
+            raise ValueError(
+                "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
+            )
+
     def _get_preprocess(self) -> Preprocess:
         """Return instance of the Preprocess object."""
         return Preprocess(
@@ -143,21 +176,16 @@ def preprocess(
                 **kwargs,
             )
         )
-        self.preprocessed = True
 
     def reset_preprocessing(self):
         """Reset all preprocessing steps"""
-        self._create_matrix()
-        self.preprocessing_info = Preprocess.init_preprocessing_info(
-            num_samples=self.mat.shape[0],
-            num_protein_groups=self.mat.shape[1],
-            intensity_column=self.intensity_column,
-            filter_columns=self.filter_columns,
-        )
-
-        self.preprocessed = False
-        # TODO fix bug: metadata is not reset/reloaded here
-        print("All preprocessing steps are reset.")
+        (
+            self.rawmat,
+            self.mat,
+            self.metadata,
+            self.sample,
+            self.preprocessing_info,
+        ) = self._get_init_dataset()
 
     def batch_correction(self, batch: str) -> None:
         """A wrapper for Preprocess.batch_correction(), see documentation there."""
@@ -340,7 +368,7 @@ def plot_volcano(
             metadata=self.metadata,
             sample=self.sample,
             index_column=self.index_column,
-            gene_names=self.gene_names,
+            gene_names=self._gene_names,
             preprocessing_info=self.preprocessing_info,
             group1=group1,
             group2=group2,
@@ -392,7 +420,7 @@ def plot_intensity(
             mat=self.mat,
             metadata=self.metadata,
             sample=self.sample,
-            intensity_column=self.intensity_column,
+            intensity_column=self._intensity_column,
             preprocessing_info=self.preprocessing_info,
             protein_id=protein_id,
             group=group,
@@ -477,110 +505,3 @@ def plot_dendrogram(
     ):
         """A wrapper for Plot.plot_dendrogram(), see documentation there."""
         return self._get_plot().plot_dendrogram(linkagefun)
-
-    def _check_loader(self, loader):
-        """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader
-
-        Args:
-            loader : loader
-        """
-        if not isinstance(loader, BaseLoader):
-            raise LoaderError(
-                "loader must be a subclass of BaseLoader, "
-                f"got {loader.__class__.__name__}"
-            )
-
-        if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty:
-            raise ValueError(
-                "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
-            )
-
-        if not isinstance(loader.index_column, str):
-            raise ValueError(
-                "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader"
-            )
-
-    def _check_matrix_values(self):
-        if np.isinf(self.mat).values.sum() > 0:
-            logging.warning("Data contains infinite values.")
-
-    def _remove_misc_samples_in_metadata(self):
-        samples_matrix = self.mat.index.to_list()
-        samples_metadata = self.metadata[self.sample].to_list()
-        misc_samples = list(set(samples_metadata) - set(samples_matrix))
-        if len(misc_samples) > 0:
-            self.metadata = self.metadata[
-                ~self.metadata[self.sample].isin(misc_samples)
-            ]
-            logging.warning(
-                f"{misc_samples} are not described in the protein data and"
-                "are removed from the metadata."
-            )
-
-    def _create_matrix(self):
-        """
-        Creates a matrix of the Outputfile, with columns displaying features (Proteins) and
-        rows the samples.
-        """
-
-        df = self.rawinput
-        df = df.set_index(self.index_column)
-
-        if isinstance(self.intensity_column, str):
-            regex_find_intensity_columns = self.intensity_column.replace(
-                "[sample]", ".*"
-            )
-            df = df.filter(regex=(regex_find_intensity_columns), axis=1)
-            # remove Intensity so only sample names remain
-            substring_to_remove = regex_find_intensity_columns.replace(".*", "")
-            df.columns = df.columns.str.replace(substring_to_remove, "")
-
-        else:
-            df = df[self.intensity_column]
-
-        # transpose dataframe
-        mat = df.transpose()
-        mat.replace([np.inf, -np.inf], np.nan, inplace=True)
-        self.rawmat = mat
-
-        # remove proteins with only zero  # TODO this is re-done in preprocessing
-        mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)]
-        self.mat = mat_no_zeros.astype(float)
-
-    def _load_metadata(
-        self, file_path: Union[pd.DataFrame, str]
-    ) -> Optional[pd.DataFrame]:
-        """Load metadata either xlsx, txt, csv or txt file
-
-        Args:
-            file_path: path to metadata file or metadata DataFrame # TODO disentangle this
-        """
-        if isinstance(file_path, pd.DataFrame):
-            df = file_path
-        elif file_path.endswith(".xlsx"):
-            warnings.filterwarnings(
-                "ignore",
-                category=UserWarning,
-                module="openpyxl",
-                # message=r"/extension is not supported and will be removed/",  # this somehow does not work here?
-            )
-            df = pd.read_excel(file_path)
-        # find robust way to detect file format
-        # else give file separation as variable
-        elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
-            df = pd.read_csv(file_path, delimiter="\t")
-        elif file_path.endswith(".csv"):
-            df = pd.read_csv(file_path)
-        else:
-            logging.warning(
-                "WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file"
-            )
-            return None
-
-        if df is not None and self.sample not in df.columns:
-            logging.error(f"sample_column: {self.sample} not found in {file_path}")
-
-        # check whether sample labeling matches protein data
-        # warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
-        df.columns = df.columns.astype(str)
-        return df
diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py
index 6236a6d8..1dfe27fa 100644
--- a/alphastats/DataSet_Preprocess.py
+++ b/alphastats/DataSet_Preprocess.py
@@ -16,6 +16,9 @@
 class PreprocessingStateKeys:
     """Keys for accessing the dictionary holding the information about preprocessing."""
 
+    # TODO disentangle these keys from the human-readable display strings
+    PREPROCESSING_DONE = "Preprocessing done"
+
     RAW_DATA_NUM_PG = "Raw data number of Protein Groups"
     NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups"
     NUM_SAMPLES = "Matrix= Number of samples"
@@ -68,6 +71,7 @@ def init_preprocessing_info(
     ) -> Dict:
         """Initialize preprocessing info."""
         return {
+            PreprocessingStateKeys.PREPROCESSING_DONE: False,
             PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups,
             PreprocessingStateKeys.NUM_PG: num_protein_groups,
             PreprocessingStateKeys.NUM_SAMPLES: num_samples,
@@ -111,6 +115,7 @@ def _remove_na_values(self, cut_off):
                 "Missing values have already been filtered. To apply another cutoff, reset preprocessing."
             )
             return
+
         cut = 1 - cut_off
 
         num_samples, num_proteins = self.mat.shape
@@ -450,6 +455,7 @@ def preprocess(
 
         self.preprocessing_info.update(
             {
+                PreprocessingStateKeys.PREPROCESSING_DONE: True,
                 PreprocessingStateKeys.NUM_PG: self.mat.shape[1],
             }
         )
diff --git a/alphastats/dataset_factory.py b/alphastats/dataset_factory.py
new file mode 100644
index 00000000..7e619979
--- /dev/null
+++ b/alphastats/dataset_factory.py
@@ -0,0 +1,127 @@
+from typing import List, Union, Dict, Optional, Tuple
+
+import pandas as pd
+import numpy as np
+import logging
+import warnings
+
+
+class DataSetFactory:
+    """Create all 'heavy' data structures of a DataSet."""
+
+    def __init__(
+        self,
+        *,
+        rawinput: pd.DataFrame,
+        index_column: str,
+        intensity_column: Union[List[str], str],
+        metadata_path: Union[str, pd.DataFrame],
+        sample_column: str,
+    ):
+        self.rawinput: pd.DataFrame = rawinput
+        self.sample_column: str = sample_column
+        self.index_column: str = index_column
+        self.intensity_column: Union[List[str], str] = intensity_column
+        self.metadata_path: Union[str, pd.DataFrame] = metadata_path
+
+    def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        """Creates a matrix: features (Proteins) as columns, samples as rows."""
+
+        df = self.rawinput
+        df = df.set_index(self.index_column)
+
+        if isinstance(self.intensity_column, str):
+            regex_find_intensity_columns = self.intensity_column.replace(
+                "[sample]", ".*"
+            )
+            df = df.filter(regex=regex_find_intensity_columns, axis=1)
+            # remove Intensity so only sample names remain
+            substring_to_remove = regex_find_intensity_columns.replace(".*", "")
+            df.columns = df.columns.str.replace(substring_to_remove, "")
+
+        else:
+            df = df[self.intensity_column]
+
+        rawmat = df.transpose()
+        rawmat.replace([np.inf, -np.inf], np.nan, inplace=True)
+
+        # remove proteins with only zero  # TODO this is re-done in preprocessing
+        mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float)
+
+        self._check_matrix_values(mat_no_zeros)
+
+        return rawmat, mat_no_zeros
+
+    @staticmethod
+    def _check_matrix_values(mat: pd.DataFrame) -> None:
+        """Check for infinite values in the matrix."""
+        if np.isinf(mat).values.sum() > 0:
+            logging.warning("Data contains infinite values.")
+
+    def create_metadata(self, mat: pd.DataFrame) -> Tuple[pd.DataFrame, str]:
+        """Create metadata DataFrame from metadata file or DataFrame."""
+
+        if self.metadata_path is not None:
+            sample = self.sample_column
+            metadata = self._load_metadata(file_path=self.metadata_path)
+            metadata = self._remove_missing_samples_from_metadata(mat, metadata, sample)
+        else:
+            sample = "sample"
+            metadata = pd.DataFrame({"sample": list(mat.index)})
+
+        return metadata, sample
+
+    def _remove_missing_samples_from_metadata(
+        self, mat: pd.DataFrame, metadata: pd.DataFrame, sample
+    ) -> pd.DataFrame:
+        """Remove samples from metadata that are not in the protein data."""
+        samples_matrix = mat.index.to_list()
+        samples_metadata = metadata[sample].to_list()
+        misc_samples = list(set(samples_metadata) - set(samples_matrix))
+        if len(misc_samples) > 0:
+            metadata = metadata[~metadata[sample].isin(misc_samples)]
+            logging.warning(
+                f"{misc_samples} are not described in the protein data and "
+                "are removed from the metadata."
+            )
+        return metadata
+
+    def _load_metadata(
+        self, file_path: Union[pd.DataFrame, str]
+    ) -> Optional[pd.DataFrame]:
+        """Load metadata from an .xlsx, .txt, .tsv or .csv file, or pass through a DataFrame
+
+        Args:
+            file_path: path to metadata file or metadata DataFrame  # TODO disentangle this
+        """
+        if isinstance(file_path, pd.DataFrame):
+            df = file_path
+        elif file_path.endswith(".xlsx"):
+            warnings.filterwarnings(
+                "ignore",
+                category=UserWarning,
+                module="openpyxl",
+                # message=r"/extension is not supported and will be removed/",  # this somehow does not work here?
+            )
+            df = pd.read_excel(file_path)
+        # find robust way to detect file format
+        # else give file separation as variable
+        elif file_path.endswith(".txt") or file_path.endswith(".tsv"):
+            df = pd.read_csv(file_path, delimiter="\t")
+        elif file_path.endswith(".csv"):
+            df = pd.read_csv(file_path)
+        else:
+            logging.warning(
+                "WARNING: Metadata could not be read. \nMetadata has to be a .xlsx, .tsv, .csv or .txt file"
+            )
+            return None
+
+        if df is not None and self.sample_column not in df.columns:
+            logging.error(
+                f"sample_column: {self.sample_column} not found in {file_path}"
+            )
+
+        # check whether sample labeling matches protein data
+        # warnings.warn("WARNING: Sample names do not match sample labelling in protein data")
+        df.columns = df.columns.astype(str)
+        return df
diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py
index 082628b8..965376d4 100644
--- a/alphastats/plots/VolcanoPlot.py
+++ b/alphastats/plots/VolcanoPlot.py
@@ -12,6 +12,7 @@
 import plotly
 from functools import lru_cache
 
+# TODO this is repeated and needs to go elsewhere!
 plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template(
     layout=plotly.graph_objects.Layout(
         paper_bgcolor="rgba(0,0,0,0)",
diff --git a/tests/gui/test_02_import_data.py b/tests/gui/test_02_import_data.py
index abdcbcb3..d768f531 100644
--- a/tests/gui/test_02_import_data.py
+++ b/tests/gui/test_02_import_data.py
@@ -110,9 +110,9 @@ def test_page_02_loads_maxquant_testfiles(
     assert not at.exception
 
     dataset = at.session_state[StateKeys.DATASET]
-    assert dataset.gene_names == "Gene names"
+    assert dataset._gene_names == "Gene names"
     assert dataset.index_column == "Protein IDs"
-    assert dataset.intensity_column == "LFQ intensity [sample]"
+    assert dataset._intensity_column == "LFQ intensity [sample]"
     assert dataset.rawmat.shape == (312, 2611)
     assert dataset.software == "MaxQuant"
    assert dataset.sample == "sample"
diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py
index dbe8a7b5..ede7479d 100644
--- a/tests/test_DataSet.py
+++ b/tests/test_DataSet.py
@@ -19,8 +19,8 @@
 from alphastats.loader.GenericLoader import GenericLoader
 from alphastats.DataSet import DataSet
 
-from alphastats.DataSet_Statistics import Statistics
 from alphastats.utils import LoaderError
+from alphastats.dataset_factory import DataSetFactory
 from alphastats.gui.utils.ui_helper import StateKeys
 
 logger = logging.getLogger(__name__)
@@ -79,15 +79,15 @@ def test_load_metadata(self):
     def test_load_metadata_missing_sample_column(self, mock):
         # is error raised when name of sample column is missing
         path = self.metadata_path
-        self.obj.sample = "wrong_sample_column"
-        self.obj._load_metadata(file_path=path)
+        self.obj._dataset_factory.sample_column = "wrong_sample_column"
+        self.obj._dataset_factory._load_metadata(file_path=path)
         mock.assert_called_once()
 
     @patch("logging.Logger.warning")
     def test_load_metadata_warning(self, mock):
         # is dataframe None and is warning produced
         file_path = "wrong/file.xxx"
-        self.obj._load_metadata(file_path=file_path)
+        self.obj._dataset_factory._load_metadata(file_path=file_path)
         mock.assert_called_once()
 
     def test_create_matrix(self):
@@ -107,8 +107,8 @@ def test_check_values_warning(self, mock):
             "B": [23, 22, 24, 22, 25],
             "C": [66, 72, np.inf, 68, -np.inf],
         }
-        self.obj.mat = pd.DataFrame(data)
-        self.obj._check_matrix_values()
+        mat = pd.DataFrame(data)
+        DataSetFactory._check_matrix_values(mat)
         mock.assert_called_once()
 
     @patch("logging.Logger.info")
@@ -221,15 +221,15 @@ def test_dataset_without_metadata(self):
     def test_load_metadata_fileformats(self):
         # test if different fileformats get loaded correctly
         metadata_path = "testfiles/alphapept/metadata.txt"
-        self.obj._load_metadata(file_path=metadata_path)
+        self.obj._dataset_factory._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
         metadata_path = "testfiles/alphapept/metadata.tsv"
-        self.obj._load_metadata(file_path=metadata_path)
+        self.obj._dataset_factory._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
         metadata_path = "testfiles/alphapept/metadata.csv"
-        self.obj._load_metadata(file_path=metadata_path)
+        self.obj._dataset_factory._load_metadata(file_path=metadata_path)
         self.assertEqual(self.obj.metadata.shape, (2, 2))
 
     @patch("logging.Logger.warning")
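
For context, a minimal usage sketch of the refactored flow. This is a hypothetical example, not part of the change; the loader class exists in alphastats, but the file paths merely mirror the testfile layout referenced in the tests above and are illustrative.

# Hypothetical sketch: DataSet now delegates matrix/metadata creation to DataSetFactory.
from alphastats.loader.MaxQuantLoader import MaxQuantLoader
from alphastats.DataSet import DataSet

loader = MaxQuantLoader(file="testfiles/maxquant/proteinGroups.txt")  # illustrative path

# __init__ builds rawmat, mat, metadata, sample and preprocessing_info in one
# step via DataSetFactory and _get_init_dataset().
dataset = DataSet(
    loader=loader,
    metadata_path="testfiles/maxquant/metadata.xlsx",  # a pd.DataFrame is now also accepted
    sample_column="sample",
)

dataset.preprocess()           # sets PreprocessingStateKeys.PREPROCESSING_DONE to True
dataset.reset_preprocessing()  # re-runs _get_init_dataset(), restoring matrix and metadata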