|
| 1 | +from dataclasses import dataclass |
| 2 | +from typing import Dict, Tuple, List |
| 3 | +import pandas as pd |
| 4 | + |
| 5 | + |
@dataclass
class ComponentPayload:
    """
    A container for the payload (dataframe and metadata) passed between pipeline components.

    The metadata dictionary uses the following keys:
    ``input_path``, ``paths_column``, ``all_paths_columns``, ``meta_columns``,
    ``feature_columns`` and ``classification_columns``.
    """
    metadata: dict
    df: pd.DataFrame

    def __init__(self, input_path: str = '', metadata: Dict = None, df: pd.DataFrame = None):
        """
        Initializes the ComponentPayload class with the given input_path, metadata and dataframe.

        A non-empty ``metadata`` dictionary is stored by reference and completed in place
        (missing list keys are added, ``input_path`` is overwritten when given).

        :param input_path: the input path of the data
        :type input_path: str
        :param metadata: the metadata of the data
        :type metadata: Dict
        :param df: the dataframe containing the data
        :type df: pd.DataFrame
        :raises AttributeError: if neither ``input_path`` nor ``metadata['paths_column']`` is supplied
        """
        self.metadata = metadata
        self.df = df
        if not self.metadata:
            # None or an empty dict: start from a fresh skeleton with every known key.
            self.metadata = {'input_path': '', 'paths_column': '', 'all_paths_columns': [],
                             'meta_columns': [], 'feature_columns': [], 'classification_columns': []}
        if input_path:
            # An explicit argument wins over whatever the metadata carried.
            self.metadata['input_path'] = input_path

        has_input_path = 'input_path' in self.metadata and self.metadata['input_path'] != ''
        has_paths_column = 'paths_column' in self.metadata and self.metadata['paths_column'] != ''
        if not has_input_path and not has_paths_column:
            raise AttributeError(
                "You must supply at least input_path or metadata['paths_column'] when initializing ComponentPayload")

        for key in ['all_paths_columns', 'meta_columns', 'feature_columns', 'classification_columns']:
            if key not in self.metadata:
                self.metadata[key] = []
        # Seed the list of paths columns with the current one when the list is empty.
        # Note that an empty-string paths_column is appended as well.
        if 'paths_column' in self.metadata and not self.metadata['all_paths_columns']:
            self.metadata['all_paths_columns'].append(self.metadata['paths_column'])
        if self.df is None:
            self.df = pd.DataFrame()

    def unpack(self) -> Tuple[Dict, pd.DataFrame]:
        """
        Returns a tuple of payload's metadata and the dataframe.

        :return: tuple of metadata and the dataframe
        :rtype: Tuple[Dict, pd.DataFrame]
        """
        return self.metadata, self.df

    def get_columns(self, all_paths_columns=False, meta_columns=False) -> List[str]:
        """
        Returns the list of column names stored in metadata, filtered based on the input parameters.

        The returned list is always a new list, so callers may modify it without
        affecting the payload's metadata.

        :param all_paths_columns: whether to include all paths columns in the returned list
                                  (otherwise only ``metadata['paths_column']`` is included)
        :type all_paths_columns: bool
        :param meta_columns: whether to include meta columns in the returned list
        :type meta_columns: bool
        :return: list of column names
        :rtype: List[str]
        :raises KeyError: if ``all_paths_columns`` is false and metadata has no ``'paths_column'`` key
        """
        if all_paths_columns:
            # Copy: returning the metadata list itself would let the extend() calls
            # below (and in get_declared_columns) permanently grow the metadata.
            columns = list(self.metadata['all_paths_columns'])
        else:
            columns = [self.metadata['paths_column']]
        if meta_columns:
            columns.extend(self.metadata['meta_columns'])
        return columns

    def get_declared_columns(self, ext_columns: List[str], all_paths_columns=False,
                             meta_columns=False) -> pd.DataFrame:
        """
        Returns a payload's dataframe containing the specified columns.

        Only columns that actually exist in the dataframe are returned. Columns appear in
        declaration order (paths columns, meta columns, then each group named in
        ``ext_columns``), with repeated names kept once.

        :param ext_columns: metadata keys (e.g. ``'feature_columns'``) whose column lists
                            should be included in the returned dataframe
        :type ext_columns: List[str]
        :param all_paths_columns: whether to include all paths columns in the returned dataframe
        :type all_paths_columns: bool
        :param meta_columns: whether to include meta columns in the returned dataframe
        :type meta_columns: bool
        :return: a dataframe containing the specified columns
        :rtype: pd.DataFrame
        """
        columns = self.get_columns(all_paths_columns, meta_columns)
        for key in ext_columns:
            columns.extend(self.metadata[key])
        available = set(self.df.columns)
        # dict.fromkeys de-duplicates while keeping first-seen order, which makes the
        # resulting column order deterministic (a plain set intersection is not).
        selected = [name for name in dict.fromkeys(columns) if name in available]
        return self.df[selected]

    def get_features_df(self, all_paths_columns=False, meta_columns=False) -> pd.DataFrame:
        """
        Returns a dataframe containing the feature columns of the payload.

        :param all_paths_columns: whether to include all paths columns in the returned dataframe
        :type all_paths_columns: bool
        :param meta_columns: whether to include meta columns in the returned dataframe
        :type meta_columns: bool
        :return: a dataframe containing the feature columns
        :rtype: pd.DataFrame
        """
        return self.get_declared_columns(['feature_columns'], all_paths_columns, meta_columns)

    def get_classification_df(self, all_paths_columns=False, meta_columns=False) -> pd.DataFrame:
        """
        Returns a dataframe containing the classification columns of the payload.

        :param all_paths_columns: whether to include all paths columns in the returned dataframe
        :type all_paths_columns: bool
        :param meta_columns: whether to include meta columns in the returned dataframe
        :type meta_columns: bool
        :return: a dataframe containing the classification columns
        :rtype: pd.DataFrame
        """
        return self.get_declared_columns(['classification_columns'], all_paths_columns, meta_columns)

    def get_full_df(self, all_paths_columns=False, meta_columns=False) -> pd.DataFrame:
        """
        Returns a dataframe containing the feature and classification columns of the payload.

        :param all_paths_columns: whether to include all paths columns in the returned dataframe
        :type all_paths_columns: bool
        :param meta_columns: whether to include meta columns in the returned dataframe
        :type meta_columns: bool
        :return: a dataframe containing the feature and classification columns
        :rtype: pd.DataFrame
        """
        return self.get_declared_columns(['feature_columns', 'classification_columns'], all_paths_columns, meta_columns)

    def remove_redundant_index_columns(self) -> None:
        """
        Removes any columns from the payload's dataframe that have a name that starts with "Unnamed" or is an empty string.

        The dataframe is modified in place. Non-string column labels are left untouched.
        """
        # Collect first, then drop once: non-string labels have no startswith(), and a
        # label that occurs several times must only be passed to drop() a single time.
        redundant = list(dict.fromkeys(
            label for label in self.df.columns
            if isinstance(label, str) and (label.startswith('Unnamed') or label == '')
        ))
        if redundant:
            self.df.drop(columns=redundant, inplace=True)
0 commit comments