From 842f41833ff5e17f0007ea1b5f210cc388417582 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 18 Sep 2024 17:47:04 +0200 Subject: [PATCH 1/6] reorder and simplify methods in DataSet constructor --- alphastats/DataSet.py | 86 +++++++++++++++++--------------- tests/gui/test_02_import_data.py | 4 +- tests/test_DataSet.py | 4 +- 3 files changed, 49 insertions(+), 45 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 21715fcb..112a7a35 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -1,4 +1,4 @@ -from typing import List, Union, Dict, Optional +from typing import List, Union, Dict, Optional, Tuple import pandas as pd import numpy as np @@ -66,44 +66,49 @@ def __init__( """ self._check_loader(loader=loader) + # fill data from loader self.rawinput: pd.DataFrame = loader.rawinput - self.software: str = loader.software - self.index_column: str = loader.index_column - self.intensity_column: Union[str, list] = loader.intensity_column self.filter_columns: List[str] = loader.filter_columns - self.evidence_df: pd.DataFrame = loader.evidence_df - self.gene_names: str = loader.gene_names + self.index_column: str = loader.index_column + self.software: str = loader.software + self._gene_names: str = loader.gene_names + # TODO this is used when creating the matrix, but then overwritten for Generic loaders later?
+ self._intensity_column: Union[str, list] = loader.intensity_column - # include filtering before - self._create_matrix() - self._check_matrix_values() + # self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused + # create matrix + self.rawmat: pd.DataFrame + self.mat: pd.DataFrame + self.rawmat, self.mat = self._create_matrix_from_rawinput() + self._check_matrix_values(self.mat) + + # create metadata self.metadata: pd.DataFrame self.sample: str if metadata_path is not None: self.sample = sample_column - self.metadata = self._load_metadata(file_path=metadata_path) - self._remove_misc_samples_in_metadata() + metadata = self._load_metadata(file_path=metadata_path) + self.metadata = self._remove_misc_samples_in_metadata(metadata) else: self.sample = "sample" self.metadata = pd.DataFrame({"sample": list(self.mat.index)}) if loader == "Generic": - intensity_column = loader._extract_sample_names( + self._intensity_column = loader._extract_sample_names( metadata=self.metadata, sample_column=self.sample ) - self.intensity_column = intensity_column - # init preprocessing settings self.preprocessing_info: Dict = Preprocess.init_preprocessing_info( num_samples=self.mat.shape[0], num_protein_groups=self.mat.shape[1], - intensity_column=self.intensity_column, + intensity_column=self._intensity_column, filter_columns=self.filter_columns, ) - self.preprocessed = False - self.preprocessed: bool = False + self._preprocessed: bool = ( + False # TODO could be moved to preprocessing_info dict + ) print("DataSet has been created.") @@ -143,19 +148,19 @@ def preprocess( **kwargs, ) ) - self.preprocessed = True + self._preprocessed = True def reset_preprocessing(self): """Reset all preprocessing steps""" - self._create_matrix() + self.rawmat, self.mat = self._create_matrix_from_rawinput() self.preprocessing_info = Preprocess.init_preprocessing_info( num_samples=self.mat.shape[0], num_protein_groups=self.mat.shape[1], - intensity_column=self.intensity_column, + 
intensity_column=self._intensity_column, filter_columns=self.filter_columns, ) - self.preprocessed = False + self._preprocessed = False # TODO fix bug: metadata is not reset/reloaded here print("All preprocessing steps are reset.") @@ -340,7 +345,7 @@ def plot_volcano( metadata=self.metadata, sample=self.sample, index_column=self.index_column, - gene_names=self.gene_names, + gene_names=self._gene_names, preprocessing_info=self.preprocessing_info, group1=group1, group2=group2, @@ -392,7 +397,7 @@ def plot_intensity( mat=self.mat, metadata=self.metadata, sample=self.sample, - intensity_column=self.intensity_column, + intensity_column=self._intensity_column, preprocessing_info=self.preprocessing_info, protein_id=protein_id, group=group, @@ -500,24 +505,24 @@ def _check_loader(self, loader): "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" ) - def _check_matrix_values(self): - if np.isinf(self.mat).values.sum() > 0: + @staticmethod + def _check_matrix_values(mat: pd.DataFrame) -> None: + if np.isinf(mat).values.sum() > 0: logging.warning("Data contains infinite values.") - def _remove_misc_samples_in_metadata(self): + def _remove_misc_samples_in_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame: samples_matrix = self.mat.index.to_list() - samples_metadata = self.metadata[self.sample].to_list() + samples_metadata = metadata[self.sample].to_list() misc_samples = list(set(samples_metadata) - set(samples_matrix)) if len(misc_samples) > 0: - self.metadata = self.metadata[ - ~self.metadata[self.sample].isin(misc_samples) - ] + metadata = metadata[~metadata[self.sample].isin(misc_samples)] logging.warning( f"{misc_samples} are not described in the protein data and" "are removed from the metadata." 
) + return metadata - def _create_matrix(self): + def _create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]: """ Creates a matrix of the Outputfile, with columns displaying features (Proteins) and rows the samples. @@ -526,26 +531,25 @@ def _create_matrix(self): df = self.rawinput df = df.set_index(self.index_column) - if isinstance(self.intensity_column, str): - regex_find_intensity_columns = self.intensity_column.replace( + if isinstance(self._intensity_column, str): + regex_find_intensity_columns = self._intensity_column.replace( "[sample]", ".*" ) - df = df.filter(regex=(regex_find_intensity_columns), axis=1) + df = df.filter(regex=regex_find_intensity_columns, axis=1) # remove Intensity so only sample names remain substring_to_remove = regex_find_intensity_columns.replace(".*", "") df.columns = df.columns.str.replace(substring_to_remove, "") else: - df = df[self.intensity_column] + df = df[self._intensity_column] - # transpose dataframe - mat = df.transpose() - mat.replace([np.inf, -np.inf], np.nan, inplace=True) - self.rawmat = mat + rawmat = df.transpose() + rawmat.replace([np.inf, -np.inf], np.nan, inplace=True) # remove proteins with only zero # TODO this is re-done in preprocessing - mat_no_zeros = mat.loc[:, (mat != 0).any(axis=0)] - self.mat = mat_no_zeros.astype(float) + mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)] + + return rawmat, mat_no_zeros.astype(float) def _load_metadata( self, file_path: Union[pd.DataFrame, str] diff --git a/tests/gui/test_02_import_data.py b/tests/gui/test_02_import_data.py index a1f85cc4..1caef40c 100644 --- a/tests/gui/test_02_import_data.py +++ b/tests/gui/test_02_import_data.py @@ -137,9 +137,9 @@ def test_page_02_loads_maxquant_testfiles( assert not at.exception dataset = at.session_state.dataset - assert dataset.gene_names == "Gene names" + assert dataset._gene_names == "Gene names" assert dataset.index_column == "Protein IDs" - assert dataset.intensity_column == "LFQ intensity [sample]" + 
assert dataset._intensity_column == "LFQ intensity [sample]" assert dataset.rawmat.shape == (312, 2611) assert dataset.software == "MaxQuant" assert dataset.sample == "sample" diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index fdfa5ffe..73156c99 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -107,8 +107,8 @@ def test_check_values_warning(self, mock): "B": [23, 22, 24, 22, 25], "C": [66, 72, np.inf, 68, -np.inf], } - self.obj.mat = pd.DataFrame(data) - self.obj._check_matrix_values() + mat = pd.DataFrame(data) + self.obj._check_matrix_values(mat) mock.assert_called_once() @patch("logging.Logger.info") From 3abea3ecb64f6f5a6597ea38630652e2169075d1 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 18 Sep 2024 18:29:34 +0200 Subject: [PATCH 2/6] introduce datasetfactory --- alphastats/DataSet.py | 106 ---------------------------- alphastats/dataset_factory.py | 127 ++++++++++++++++++++++++++++++++++ 2 files changed, 127 insertions(+), 106 deletions(-) create mode 100644 alphastats/dataset_factory.py diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 112a7a35..abcc6d5f 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -482,109 +482,3 @@ def plot_dendrogram( ): """A wrapper for Plot.plot_dendrogram(), see documentation there.""" return self._get_plot().plot_dendrogram(linkagefun) - - def _check_loader(self, loader): - """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader - - Args: - loader : loader - """ - if not isinstance(loader, BaseLoader): - raise LoaderError( - "loader must be a subclass of BaseLoader, " - f"got {loader.__class__.__name__}" - ) - - if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty: - raise ValueError( - "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" - ) - - if not 
isinstance(loader.index_column, str): - raise ValueError( - "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" - ) - - @staticmethod - def _check_matrix_values(mat: pd.DataFrame) -> None: - if np.isinf(mat).values.sum() > 0: - logging.warning("Data contains infinite values.") - - def _remove_misc_samples_in_metadata(self, metadata: pd.DataFrame) -> pd.DataFrame: - samples_matrix = self.mat.index.to_list() - samples_metadata = metadata[self.sample].to_list() - misc_samples = list(set(samples_metadata) - set(samples_matrix)) - if len(misc_samples) > 0: - metadata = metadata[~metadata[self.sample].isin(misc_samples)] - logging.warning( - f"{misc_samples} are not described in the protein data and" - "are removed from the metadata." - ) - return metadata - - def _create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]: - """ - Creates a matrix of the Outputfile, with columns displaying features (Proteins) and - rows the samples. 
- """ - - df = self.rawinput - df = df.set_index(self.index_column) - - if isinstance(self._intensity_column, str): - regex_find_intensity_columns = self._intensity_column.replace( - "[sample]", ".*" - ) - df = df.filter(regex=regex_find_intensity_columns, axis=1) - # remove Intensity so only sample names remain - substring_to_remove = regex_find_intensity_columns.replace(".*", "") - df.columns = df.columns.str.replace(substring_to_remove, "") - - else: - df = df[self._intensity_column] - - rawmat = df.transpose() - rawmat.replace([np.inf, -np.inf], np.nan, inplace=True) - - # remove proteins with only zero # TODO this is re-done in preprocessing - mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)] - - return rawmat, mat_no_zeros.astype(float) - - def _load_metadata( - self, file_path: Union[pd.DataFrame, str] - ) -> Optional[pd.DataFrame]: - """Load metadata either xlsx, txt, csv or txt file - - Args: - file_path: path to metadata file or metadata DataFrame # TODO disentangle this - """ - if isinstance(file_path, pd.DataFrame): - df = file_path - elif file_path.endswith(".xlsx"): - warnings.filterwarnings( - "ignore", - category=UserWarning, - module="openpyxl", - # message=r"/extension is not supported and will be removed/", # this somehow does not work here? - ) - df = pd.read_excel(file_path) - # find robust way to detect file format - # else give file separation as variable - elif file_path.endswith(".txt") or file_path.endswith(".tsv"): - df = pd.read_csv(file_path, delimiter="\t") - elif file_path.endswith(".csv"): - df = pd.read_csv(file_path) - else: - logging.warning( - "WARNING: Metadata could not be read. 
\nMetadata has to be a .xslx, .tsv, .csv or .txt file" - ) - return None - - if df is not None and self.sample not in df.columns: - logging.error(f"sample_column: {self.sample} not found in {file_path}") - - # check whether sample labeling matches protein data - # warnings.warn("WARNING: Sample names do not match sample labelling in protein data") - df.columns = df.columns.astype(str) - return df diff --git a/alphastats/dataset_factory.py b/alphastats/dataset_factory.py new file mode 100644 index 00000000..7e619979 --- /dev/null +++ b/alphastats/dataset_factory.py @@ -0,0 +1,127 @@ +from typing import List, Union, Dict, Optional, Tuple + +import pandas as pd +import numpy as np +import logging +import warnings + + +class DataSetFactory: + """Create all 'heavy' data structures of a DataSet.""" + + def __init__( + self, + *, + rawinput: pd.DataFrame, + index_column: str, + intensity_column: Union[List[str], str], + metadata_path: Union[str, pd.DataFrame], + sample_column: str, + ): + self.rawinput: pd.DataFrame = rawinput + self.sample_column: str = sample_column + self.index_column: str = index_column + self.intensity_column: Union[List[str], str] = intensity_column + self.metadata_path: Union[str, pd.DataFrame] = metadata_path + + def create_matrix_from_rawinput(self) -> Tuple[pd.DataFrame, pd.DataFrame]: + """Creates a matrix: features (Proteins) as columns, samples as rows.""" + + df = self.rawinput + df = df.set_index(self.index_column) + + if isinstance(self.intensity_column, str): + regex_find_intensity_columns = self.intensity_column.replace( + "[sample]", ".*" + ) + df = df.filter(regex=regex_find_intensity_columns, axis=1) + # remove Intensity so only sample names remain + substring_to_remove = regex_find_intensity_columns.replace(".*", "") + df.columns = df.columns.str.replace(substring_to_remove, "") + + else: + df = df[self.intensity_column] + + rawmat = df.transpose() + rawmat.replace([np.inf, -np.inf], np.nan, inplace=True) + + # remove proteins with 
only zero # TODO this is re-done in preprocessing + mat_no_zeros = rawmat.loc[:, (rawmat != 0).any(axis=0)].astype(float) + + self._check_matrix_values(mat_no_zeros) + + return rawmat, mat_no_zeros + + @staticmethod + def _check_matrix_values(mat: pd.DataFrame) -> None: + """Check for infinite values in the matrix.""" + if np.isinf(mat).values.sum() > 0: + logging.warning("Data contains infinite values.") + + def create_metadata(self, mat: pd.DataFrame) -> Tuple[pd.DataFrame, str]: + """Create metadata DataFrame from metadata file or DataFrame.""" + + if self.metadata_path is not None: + sample = self.sample_column + metadata = self._load_metadata(file_path=self.metadata_path) + metadata = self._remove_missing_samples_from_metadata(mat, metadata, sample) + else: + sample = "sample" + metadata = pd.DataFrame({"sample": list(mat.index)}) + + return metadata, sample + + def _remove_missing_samples_from_metadata( + self, mat: pd.DataFrame, metadata: pd.DataFrame, sample + ) -> pd.DataFrame: + """Remove samples from metadata that are not in the protein data.""" + samples_matrix = mat.index.to_list() + samples_metadata = metadata[sample].to_list() + misc_samples = list(set(samples_metadata) - set(samples_matrix)) + if len(misc_samples) > 0: + metadata = metadata[~metadata[sample].isin(misc_samples)] + logging.warning( + f"{misc_samples} are not described in the protein data and" + "are removed from the metadata." + ) + return metadata + + def _load_metadata( + self, file_path: Union[pd.DataFrame, str] + ) -> Optional[pd.DataFrame]: + """Load metadata either xlsx, txt, csv or txt file + + Args: + file_path: path to metadata file or metadata DataFrame # TODO disentangle this + """ + if isinstance(file_path, pd.DataFrame): + df = file_path + elif file_path.endswith(".xlsx"): + warnings.filterwarnings( + "ignore", + category=UserWarning, + module="openpyxl", + # message=r"/extension is not supported and will be removed/", # this somehow does not work here? 
+ ) + df = pd.read_excel(file_path) + # find robust way to detect file format + # else give file separation as variable + elif file_path.endswith(".txt") or file_path.endswith(".tsv"): + df = pd.read_csv(file_path, delimiter="\t") + elif file_path.endswith(".csv"): + df = pd.read_csv(file_path) + else: + logging.warning( + "WARNING: Metadata could not be read. \nMetadata has to be a .xslx, .tsv, .csv or .txt file" + ) + return None + + if df is not None and self.sample_column not in df.columns: + logging.error( + f"sample_column: {self.sample_column} not found in {file_path}" + ) + + # check whether sample labeling matches protein data + # warnings.warn("WARNING: Sample names do not match sample labelling in protein data") + df.columns = df.columns.astype(str) + return df From 5446d0e3804bd1199b71c94738317aeb65a199ce Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 18 Sep 2024 18:34:05 +0200 Subject: [PATCH 3/6] use datasetfactory --- alphastats/DataSet.py | 111 ++++++++++++++++++++++++++---------------- 1 file changed, 69 insertions(+), 42 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index abcc6d5f..04d88e52 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -20,6 +20,7 @@ from alphastats.plots.IntensityPlot import IntensityPlot from alphastats.plots.SampleHistogram import SampleHistogram from alphastats.plots.VolcanoPlot import VolcanoPlot +from alphastats.dataset_factory import DataSetFactory plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( layout=plotly.graph_objects.Layout( @@ -48,14 +49,14 @@ class DataSet: def __init__( self, loader: BaseLoader, - metadata_path: Optional[str] = None, + metadata_path: Optional[Union[str, pd.DataFrame]] = None, sample_column: Optional[str] = None, ): """Create DataSet Args: loader (_type_): loader of class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader - metadata_path (str, 
optional): path to metadata file. Defaults to None. + metadata_path (str or pd.DataFrame, optional): path to metadata file or an actual df. Defaults to None. sample_column (str, optional): column in metadata file indicating the sample IDs. Defaults to None. Attributes of a DataSet instance: @@ -72,46 +73,75 @@ def __init__( self.index_column: str = loader.index_column self.software: str = loader.software self._gene_names: str = loader.gene_names - # TODO this is used when creating the matrix, but then overwritten for Generic loaders later? - self._intensity_column: Union[str, list] = loader.intensity_column - # self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused - - # create matrix - self.rawmat: pd.DataFrame - self.mat: pd.DataFrame - self.rawmat, self.mat = self._create_matrix_from_rawinput() - self._check_matrix_values(self.mat) - - # create metadata - self.metadata: pd.DataFrame - self.sample: str - if metadata_path is not None: - self.sample = sample_column - metadata = self._load_metadata(file_path=metadata_path) - self.metadata = self._remove_misc_samples_in_metadata(metadata) - else: - self.sample = "sample" - self.metadata = pd.DataFrame({"sample": list(self.mat.index)}) - - if loader == "Generic": - self._intensity_column = loader._extract_sample_names( + self._intensity_column: Union[str, list] = ( + loader._extract_sample_names( metadata=self.metadata, sample_column=self.sample ) + if loader == "Generic" + else loader.intensity_column + ) + + # self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused - self.preprocessing_info: Dict = Preprocess.init_preprocessing_info( - num_samples=self.mat.shape[0], - num_protein_groups=self.mat.shape[1], + self.dataset_factory = DataSetFactory( + rawinput=self.rawinput, + index_column=self.index_column, intensity_column=self._intensity_column, - filter_columns=self.filter_columns, + metadata_path=metadata_path, + sample_column=sample_column, ) - self._preprocessed: bool = ( - False # TODO could 
be moved to preprocessing_info dict + rawmat, mat, metadata, sample, preprocessing_info, preprocessed = ( + self._init_dataset() ) + self.rawmat: pd.DataFrame = rawmat + self.mat: pd.DataFrame = mat + self.metadata: pd.DataFrame = metadata + self.sample: str = sample + self.preprocessing_info: Dict = preprocessing_info + self._preprocessed: bool = preprocessed print("DataSet has been created.") + def _init_dataset(self): + rawmat, mat = self.dataset_factory.create_matrix_from_rawinput() + + metadata, sample = self.dataset_factory.create_metadata(mat) + + preprocessing_info = Preprocess.init_preprocessing_info( + num_samples=mat.shape[0], + num_protein_groups=mat.shape[1], + intensity_column=self._intensity_column, + filter_columns=self.filter_columns, + ) + + preprocessed = False # TODO could be moved to preprocessing_info dict + + return rawmat, mat, metadata, sample, preprocessing_info, preprocessed + + def _check_loader(self, loader): + """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader + + Args: + loader : loader + """ + if not isinstance(loader, BaseLoader): + raise LoaderError( + "loader must be a subclass of BaseLoader, " + f"got {loader.__class__.__name__}" + ) + + if not isinstance(loader.rawinput, pd.DataFrame) or loader.rawinput.empty: + raise ValueError( + "Error in rawinput, consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" + ) + + if not isinstance(loader.index_column, str): + raise ValueError( + "Invalid index_column: consider reloading your data with: AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader, SpectronautLoader" + ) + def _get_preprocess(self) -> Preprocess: """Return instance of the Preprocess object.""" return Preprocess( @@ -152,17 +182,14 @@ def preprocess( def reset_preprocessing(self): """Reset all preprocessing steps""" - self.rawmat, self.mat = self._create_matrix_from_rawinput() - self.preprocessing_info = 
Preprocess.init_preprocessing_info( - num_samples=self.mat.shape[0], - num_protein_groups=self.mat.shape[1], - intensity_column=self._intensity_column, - filter_columns=self.filter_columns, - ) - - self._preprocessed = False - # TODO fix bug: metadata is not reset/reloaded here - print("All preprocessing steps are reset.") + ( + self.rawmat, + self.mat, + self.metadata, + self.sample, + self.preprocessing_info, + self._preprocessed, + ) = self._init_dataset() def batch_correction(self, batch: str) -> None: """A wrapper for Preprocess.batch_correction(), see documentation there.""" From ac311c1ed1e7624e965f6b642bdf27c41c602eb7 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Wed, 18 Sep 2024 18:34:17 +0200 Subject: [PATCH 4/6] fix tests --- alphastats/plots/VolcanoPlot.py | 1 + tests/test_DataSet.py | 17 ++++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/alphastats/plots/VolcanoPlot.py b/alphastats/plots/VolcanoPlot.py index 0b9887a9..a4e78ef1 100644 --- a/alphastats/plots/VolcanoPlot.py +++ b/alphastats/plots/VolcanoPlot.py @@ -12,6 +12,7 @@ import plotly from functools import lru_cache +# TODO this is repeated and needs to go elsewhere! 
plotly.io.templates["alphastats_colors"] = plotly.graph_objects.layout.Template( layout=plotly.graph_objects.Layout( paper_bgcolor="rgba(0,0,0,0)", diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 73156c99..551b20ae 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -19,9 +19,8 @@ from alphastats.loader.GenericLoader import GenericLoader from alphastats.DataSet import DataSet -from alphastats.DataSet_Statistics import Statistics from alphastats.utils import LoaderError - +from alphastats.dataset_factory import DataSetFactory logger = logging.getLogger(__name__) @@ -79,15 +78,15 @@ def test_load_metadata(self): def test_load_metadata_missing_sample_column(self, mock): # is error raised when name of sample column is missing path = self.metadata_path - self.obj.sample = "wrong_sample_column" - self.obj._load_metadata(file_path=path) + self.obj.dataset_factory.sample_column = "wrong_sample_column" + self.obj.dataset_factory._load_metadata(file_path=path) mock.assert_called_once() @patch("logging.Logger.warning") def test_load_metadata_warning(self, mock): # is dataframe None and is warning produced file_path = "wrong/file.xxx" - self.obj._load_metadata(file_path=file_path) + self.obj.dataset_factory._load_metadata(file_path=file_path) mock.assert_called_once() def test_create_matrix(self): @@ -108,7 +107,7 @@ def test_check_values_warning(self, mock): "C": [66, 72, np.inf, 68, -np.inf], } mat = pd.DataFrame(data) - self.obj._check_matrix_values(mat) + DataSetFactory._check_matrix_values(mat) mock.assert_called_once() @patch("logging.Logger.info") @@ -221,15 +220,15 @@ def test_dataset_without_metadata(self): def test_load_metadata_fileformats(self): # test if different fileformats get loaded correctly metadata_path = "testfiles/alphapept/metadata.txt" - self.obj._load_metadata(file_path=metadata_path) + self.obj.dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = 
"testfiles/alphapept/metadata.tsv" - self.obj._load_metadata(file_path=metadata_path) + self.obj.dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = "testfiles/alphapept/metadata.csv" - self.obj._load_metadata(file_path=metadata_path) + self.obj.dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) @patch("logging.Logger.warning") From c5cce03943ba16b94e8d49a9478651960e782a78 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Fri, 20 Sep 2024 08:07:28 +0200 Subject: [PATCH 5/6] make dataset factory private --- alphastats/DataSet.py | 15 +++++++++------ tests/test_DataSet.py | 12 ++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 04d88e52..8f6a5432 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -84,7 +84,7 @@ def __init__( # self.evidence_df: pd.DataFrame = loader.evidence_df # TODO unused - self.dataset_factory = DataSetFactory( + self._dataset_factory = DataSetFactory( rawinput=self.rawinput, index_column=self.index_column, intensity_column=self._intensity_column, @@ -93,7 +93,7 @@ def __init__( ) rawmat, mat, metadata, sample, preprocessing_info, preprocessed = ( - self._init_dataset() + self._get_init_dataset() ) self.rawmat: pd.DataFrame = rawmat self.mat: pd.DataFrame = mat @@ -104,10 +104,13 @@ def __init__( print("DataSet has been created.") - def _init_dataset(self): - rawmat, mat = self.dataset_factory.create_matrix_from_rawinput() + def _get_init_dataset( + self, + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict, bool]: + """Get the initial data structure for the DataSet.""" + rawmat, mat = self._dataset_factory.create_matrix_from_rawinput() - metadata, sample = self.dataset_factory.create_metadata(mat) + metadata, sample = self._dataset_factory.create_metadata(mat) preprocessing_info 
= Preprocess.init_preprocessing_info( num_samples=mat.shape[0], @@ -189,7 +192,7 @@ def reset_preprocessing(self): self.sample, self.preprocessing_info, self._preprocessed, - ) = self._init_dataset() + ) = self._get_init_dataset() def batch_correction(self, batch: str) -> None: """A wrapper for Preprocess.batch_correction(), see documentation there.""" diff --git a/tests/test_DataSet.py b/tests/test_DataSet.py index 551b20ae..12545b2d 100644 --- a/tests/test_DataSet.py +++ b/tests/test_DataSet.py @@ -78,15 +78,15 @@ def test_load_metadata(self): def test_load_metadata_missing_sample_column(self, mock): # is error raised when name of sample column is missing path = self.metadata_path - self.obj.dataset_factory.sample_column = "wrong_sample_column" - self.obj.dataset_factory._load_metadata(file_path=path) + self.obj._dataset_factory.sample_column = "wrong_sample_column" + self.obj._dataset_factory._load_metadata(file_path=path) mock.assert_called_once() @patch("logging.Logger.warning") def test_load_metadata_warning(self, mock): # is dataframe None and is warning produced file_path = "wrong/file.xxx" - self.obj.dataset_factory._load_metadata(file_path=file_path) + self.obj._dataset_factory._load_metadata(file_path=file_path) mock.assert_called_once() def test_create_matrix(self): @@ -220,15 +220,15 @@ def test_dataset_without_metadata(self): def test_load_metadata_fileformats(self): # test if different fileformats get loaded correctly metadata_path = "testfiles/alphapept/metadata.txt" - self.obj.dataset_factory._load_metadata(file_path=metadata_path) + self.obj._dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = "testfiles/alphapept/metadata.tsv" - self.obj.dataset_factory._load_metadata(file_path=metadata_path) + self.obj._dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) metadata_path = "testfiles/alphapept/metadata.csv" - 
self.obj.dataset_factory._load_metadata(file_path=metadata_path) + self.obj._dataset_factory._load_metadata(file_path=metadata_path) self.assertEqual(self.obj.metadata.shape, (2, 2)) @patch("logging.Logger.warning") From 441002e281856ca7252b4b7ac78271b553860854 Mon Sep 17 00:00:00 2001 From: mschwoerer <82171591+mschwoer@users.noreply.github.com> Date: Fri, 20 Sep 2024 09:03:15 +0200 Subject: [PATCH 6/6] move dedicated "preprocessed" flag to dict --- alphastats/DataSet.py | 13 +++---------- alphastats/DataSet_Preprocess.py | 6 ++++++ 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/alphastats/DataSet.py b/alphastats/DataSet.py index 8f6a5432..7ee4484d 100644 --- a/alphastats/DataSet.py +++ b/alphastats/DataSet.py @@ -92,21 +92,18 @@ def __init__( sample_column=sample_column, ) - rawmat, mat, metadata, sample, preprocessing_info, preprocessed = ( - self._get_init_dataset() - ) + rawmat, mat, metadata, sample, preprocessing_info = self._get_init_dataset() self.rawmat: pd.DataFrame = rawmat self.mat: pd.DataFrame = mat self.metadata: pd.DataFrame = metadata self.sample: str = sample self.preprocessing_info: Dict = preprocessing_info - self._preprocessed: bool = preprocessed print("DataSet has been created.") def _get_init_dataset( self, - ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict, bool]: + ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, str, Dict]: """Get the initial data structure for the DataSet.""" rawmat, mat = self._dataset_factory.create_matrix_from_rawinput() @@ -119,9 +116,7 @@ def _get_init_dataset( filter_columns=self.filter_columns, ) - preprocessed = False # TODO could be moved to preprocessing_info dict - - return rawmat, mat, metadata, sample, preprocessing_info, preprocessed + return rawmat, mat, metadata, sample, preprocessing_info def _check_loader(self, loader): """Checks if the Loader is from class AlphaPeptLoader, MaxQuantLoader, DIANNLoader, FragPipeLoader @@ -181,7 +176,6 @@ def preprocess( **kwargs, ) ) - 
self._preprocessed = True def reset_preprocessing(self): """Reset all preprocessing steps""" @@ -191,7 +185,6 @@ def reset_preprocessing(self): self.metadata, self.sample, self.preprocessing_info, - self._preprocessed, ) = self._get_init_dataset() def batch_correction(self, batch: str) -> None: diff --git a/alphastats/DataSet_Preprocess.py b/alphastats/DataSet_Preprocess.py index 56cb3902..6fd0d3bf 100644 --- a/alphastats/DataSet_Preprocess.py +++ b/alphastats/DataSet_Preprocess.py @@ -16,6 +16,9 @@ class PreprocessingStateKeys: """Keys for accessing the dictionary holding the information about preprocessing.""" + # TODO disentangle these keys from the human-readable display strings + PREPROCESSING_DONE = "Preprocessing done" + RAW_DATA_NUM_PG = "Raw data number of Protein Groups" NUM_PG = "Matrix= Number of ProteinIDs/ProteinGroups" NUM_SAMPLES = "Matrix= Number of samples" @@ -68,6 +71,7 @@ def init_preprocessing_info( ) -> Dict: """Initialize preprocessing info.""" return { + PreprocessingStateKeys.PREPROCESSING_DONE: False, PreprocessingStateKeys.RAW_DATA_NUM_PG: num_protein_groups, PreprocessingStateKeys.NUM_PG: num_protein_groups, PreprocessingStateKeys.NUM_SAMPLES: num_samples, @@ -111,6 +115,7 @@ def _remove_na_values(self, cut_off): "Missing values have already been filtered. To apply another cutoff, reset preprocessing." ) return + cut = 1 - cut_off num_samples, num_proteins = self.mat.shape @@ -438,6 +443,7 @@ def preprocess( self.preprocessing_info.update( { + PreprocessingStateKeys.PREPROCESSING_DONE: True, PreprocessingStateKeys.NUM_PG: self.mat.shape[1], } )