From 3ffc95e341037fcd34df19d153c435b8587ea586 Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Sun, 27 Mar 2022 14:05:40 -0700 Subject: [PATCH 1/6] feat(eda): added target analysis given numerical target column feat(eda): added basic numerical target analysis squash this feat(eda): added target analysis given numerical target column --- dataprep/eda/create_diff_report/__init__.py | 3 +- .../eda/create_diff_report/diff_formatter.py | 35 +++-- dataprep/eda/diff/render.py | 146 +++++++++++++++--- dataprep/eda/intermediate.py | 5 + 4 files changed, 160 insertions(+), 29 deletions(-) diff --git a/dataprep/eda/create_diff_report/__init__.py b/dataprep/eda/create_diff_report/__init__.py index e48d6f8a5..860be8bc2 100644 --- a/dataprep/eda/create_diff_report/__init__.py +++ b/dataprep/eda/create_diff_report/__init__.py @@ -22,6 +22,7 @@ def create_diff_report( df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], + target: Optional[str] = None, config: Optional[Dict[str, Any]] = None, display: Optional[List[str]] = None, title: Optional[str] = "DataPrep Report", @@ -63,7 +64,7 @@ def create_diff_report( _suppress_warnings() cfg = Config.from_dict(display, config) - components = format_diff_report(df_list, cfg, mode, progress) + components = format_diff_report(df_list, cfg, mode, progress, target) dict_stats = defaultdict(list) diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index 0dcd7bd67..7c05559c6 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -68,6 +68,7 @@ def format_diff_report( cfg: Config, mode: Optional[str], progress: bool = True, + target: Optional[str] = None ) -> Dict[str, Any]: """ Format the data and figures needed by create_diff_report @@ -110,13 +111,26 @@ def format_diff_report( if mode == "basic": # note: we need the type ignore comment for mypy otherwise it complains because # it doesn't realize that we converted df_list to a list if it's a dictionary - report = format_basic(df_list, cfg) # type: ignore + if target: + validate_target(target, df_list) + report = format_basic(df_list, target, cfg) # type: ignore else: raise ValueError(f"Unknown mode: {mode}") return report +def validate_target(target: str, df_list: List[pd.DataFrame]): + """ + Helper function, verify that target column exists + """ + exists = False + for df in df_list: + if target in df.columns: + exists = True + break + if not exists: + raise ValueError(f'Sorry, {target} is not a valid column') -def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: +def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config) -> Dict[str, Any]: """ Format basic version. @@ -158,7 +172,7 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: # data = dask.compute(data) delayed_results.append(data) - res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list) + res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list, target=target) dask_results["df_computations"] = delayed_results dask_results["plots"] = res_plots @@ -211,7 +225,7 @@ def basic_computations(df: EDAFrame, cfg: Config) -> Dict[str, Any]: def compute_plot_data( - df_list: List[dd.DataFrame], cfg: Config, dtype: Optional[DTypeDef] + pd_list: List[pd.DataFrame], cfg: Config, dtype: Optional[DTypeDef], target: Optional[str] ) -> Intermediate: """ Compute function for create_diff_report's plots @@ -229,6 +243,10 @@ def compute_plot_data( """ # pylint: disable=too-many-branches, too-many-locals + df_list = list(map(to_dask, pd_list)) + for i, _ in enumerate(df_list): + df_list[i].columns = df_list[i].columns.astype(str) + dfs = Dfs(df_list) dfs_cols = dfs.columns.apply("to_list").data @@ -277,7 +295,7 @@ def compute_plot_data( elif is_dtype(dtp, DateTime_v1()): plot_data.append((col, dtp, dask.compute(*datum), orig)) # workaround - return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid") + return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list) def _compute_variables(df: EDAFrame, cfg: Config) -> Dict[str, Any]: @@ -407,14 +425,11 @@ def _format_variables(df: EDAFrame, cfg: Config, data: Dict[str, Any]) -> Dict[s def _format_plots( - df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config + df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config, target: Optional[str] ) -> Dict[str, Any]: """Formatting of plots section""" - df_list = list(map(to_dask, df_list)) - for i, _ in enumerate(df_list): - df_list[i].columns = df_list[i].columns.astype(str) - itmdt = compute_plot_data(df_list=df_list, cfg=cfg, dtype=None) + itmdt = compute_plot_data(pd_list=df_list, cfg=cfg, dtype=None, target=target) return render_diff(itmdt, cfg=cfg) diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 7a49bd679..da0923b6a 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -1,19 +1,24 @@ """ This module implements the visualization for the plot_diff function. """ # pylint: disable=too-many-lines +from turtle import color from typing import Any, Dict, List, Tuple, Optional - +from sklearn.preprocessing import MinMaxScaler import math import numpy as np import pandas as pd +import dask.array as da +import matplotlib.pyplot as plt from bokeh.models import ( HoverTool, Panel, FactorRange, ) -from bokeh.plotting import Figure, figure +from bokeh.plotting import Figure, figure, show from bokeh.transform import dodge from bokeh.layouts import row +from bokeh.models.ranges import Range1d +from bokeh.models import LinearAxis from ..configs import Config from ..dtypes import Continuous, DateTime, Nominal, is_dtype @@ -78,6 +83,8 @@ def bar_viz( orig: List[str], df_labels: List[str], baseline: int, + target: Optional[str] = None, + df_list: Optional[List[pd.DataFrame]] = None ) -> Figure: """ Render a bar chart @@ -94,6 +101,12 @@ def bar_viz( ("Source", "@orig"), ] + col1_min = df[0][col].min() + col2_min = df[1][col].min() + col1_max = df[0][col].max() + col2_max = df[1][col].max() + y_inc = 0.05 + if show_yticks: if len(df[baseline]) > 10: plot_width = 28 * len(df[baseline]) @@ -106,12 +119,15 @@ def bar_viz( tools="hover", x_range=list(df[baseline].index), y_axis_type=yscale, + y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)) ) - + row_names = None offset = np.linspace(-0.08 * len(df), 0.08 * len(df), len(df)) if len(df) > 1 else [0] for i, (nrow, data) in enumerate(zip(nrows, df)): data["pct"] = data[col] / nrow * 100 data.index = [str(val) for val in data.index] + if row_names is None: + row_names = data.index data["orig"] = orig[i] fig.vbar( @@ -126,7 +142,6 @@ def bar_viz( tweak_figure(fig, "bar", show_yticks) fig.yaxis.axis_label = "Count" - x_axis_label = "" if ttl_grps > len(df[baseline]): x_axis_label += f"Top {len(df[baseline])} of {ttl_grps} {col}" @@ -142,6 +157,21 @@ def bar_viz( if show_yticks and yscale == "linear": _format_axis(fig, 0, df[baseline].max(), "y") + + df1, df2 = df_list[0], df_list[1] + if target != col and target and col in df1.columns and col in df2.columns: + col1, col2 = df_list[0][col], df_list[1][col] + row_avgs_1 = [] + row_avgs_2 = [] + for names in row_names: + row_avgs_1.append(df_list[0][target][col1 == names].mean()) + row_avgs_2.append(df_list[1][target][col2 == names].mean()) + + row_avgs_1 = [0 if math.isnan(x) else x for x in row_avgs_1] + row_avgs_2 = [0 if math.isnan(x) else x for x in row_avgs_2] + fig.extra_y_ranges = {"Averages": Range1d(start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc), end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc))} + fig.multi_line([row_names, row_names], [row_avgs_1, row_avgs_2], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4) + fig.add_layout(LinearAxis(y_range_name="Averages"), 'right') return fig @@ -155,28 +185,56 @@ def hist_viz( show_yticks: bool, df_labels: List[str], orig: Optional[List[str]] = None, + target: Optional[str] = None, + df_list: Optional[List[pd.DataFrame]] = None ) -> Figure: """ Render a histogram """ # pylint: disable=too-many-arguments,too-many-locals - tooltips = [ ("Bin", "@intvl"), ("Frequency", "@freq"), ("Percent", "@pct{0.2f}%"), ("Source", "@orig"), ] + df1, df2 = df_list[0], df_list[1] + y_inc = 0.05 + tooltips = [ + ("Bin", "@intvl"), + ("Frequency", "@freq"), + ("Percent", "@pct{0.2f}%"), + ("Source", "@orig"), + ] + fig = None + + y_start, y_end = None, None + counts_list = [] + if target and target != col and col in df1.columns and col in df2.columns: + for hst in hist: + counts, bins = hst + counts_list.append(counts) + + counts_min_1 = min(counts_list[0]) + counts_min_2 = min(counts_list[1]) + + counts_max_1 = max(counts_list[0]) + counts_max_2 = max(counts_list[1]) + + y_start, y_end = min(counts_min_1, counts_min_2), max(counts_max_1, counts_max_2) + + fig = Figure( plot_height=plot_height, plot_width=plot_width, title=col, toolbar_location=None, - y_axis_type=yscale, + y_axis_type=yscale ) - + bins_list = [] for i, hst in enumerate(hist): counts, bins = hst + bins_list.append(bins) if sum(counts) == 0: fig.rect(x=0, y=0, width=0, height=0) continue @@ -192,16 +250,34 @@ def hist_viz( } ) bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2 - fig.quad( - source=df, - left="left", - right="right", - bottom=bottom, - alpha=0.5, - top="freq", - fill_color=CATEGORY10[i], - line_color=CATEGORY10[i], - ) + if y_start is not None and y_end is not None: + # fig.y_range = (y_start * (1 - y_inc), y_end * (1 + y_inc)) + fig.extra_y_ranges = {"Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc))} + fig.quad( + source=df, + left="left", + right="right", + bottom=bottom, + alpha=0.5, + top="freq", + fill_color=CATEGORY10[i], + line_color=CATEGORY10[i], + y_range_name="Counts" + ) + else: + fig.quad( + source=df, + left="left", + right="right", + bottom=bottom, + alpha=0.5, + top="freq", + fill_color=CATEGORY10[i], + line_color=CATEGORY10[i] + ) + # if col == 'LotFrontage': + # breakpoint() + hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline") fig.add_tools(hover) @@ -224,6 +300,34 @@ def hist_viz( fig.xaxis.axis_label = x_axis_label fig.xaxis.axis_label_standoff = 0 + if target and target != col and col in df1.columns and col in df2.columns: + col1, col2 = df1[col], df2[col] + source1, source2 = col1, col2 + col1 = col1[~np.isnan(col1)] + col2 = col2[~np.isnan(col2)] + num_bins1 = len(bins_list[0]) - 1 + num_bins2 = len(bins_list[1]) - 1 + bins_1, bins_2 = bins_list[0], bins_list[1] + + df1_source_bins_series = pd.cut(source1, bins=bins_1, labels=False) + df1_bin_averages = [None] * num_bins1 + + df2_source_bins_series = pd.cut(source2, bins=bins_2, labels=False) + df2_bin_averages = [None] * num_bins2 + + for b in range(num_bins1): + df1_bin_averages[b] = df1[target][df1_source_bins_series == b].mean() + for b in range(num_bins2): + df2_bin_averages[b] = df2[target][df2_source_bins_series == b].mean() + + df1_bin_averages = [0 if math.isnan(x) else x for x in df1_bin_averages] + df2_bin_averages = [0 if math.isnan(x) else x for x in df2_bin_averages] + max_range = max(df1_bin_averages + df2_bin_averages) + min_range = min(df1_bin_averages + df2_bin_averages) + + fig.extra_y_ranges['Averages'] = Range1d(start=min_range * (1 - y_inc), end=max_range * (1 + y_inc)) + fig.multi_line([bins_1, bins_2], [df1_bin_averages, df2_bin_averages], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4) + fig.add_layout(LinearAxis(y_range_name="Averages", axis_label='Bin Averages'), 'right') return fig @@ -610,6 +714,9 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: nrows = itmdt["stats"]["nrows"] titles: List[str] = [] + df_list = itmdt.df_list + target = itmdt.target + for col, dtp, data, orig in itmdt["data"]: fig = None if is_dtype(dtp, Nominal()): @@ -626,6 +733,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: orig, df_labels, baseline if len(df) > 1 else 0, + target, + df_list ) elif is_dtype(dtp, Continuous()): if cfg.diff.density: @@ -643,6 +752,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: False, df_labels, orig, + target, + df_list ) elif is_dtype(dtp, DateTime()): df, timeunit = data @@ -760,7 +871,6 @@ def render_diff(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: cfg Config instance """ - if itmdt.visual_type == "comparison_grid": visual_elem = render_comparison_grid(itmdt, cfg) if itmdt.visual_type == "comparison_continuous": diff --git a/dataprep/eda/intermediate.py b/dataprep/eda/intermediate.py index 331e4fb0b..35da501d1 100644 --- a/dataprep/eda/intermediate.py +++ b/dataprep/eda/intermediate.py @@ -30,6 +30,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: visual_type = kwargs.pop("visual_type") super().__init__(**kwargs) self.visual_type = visual_type + if 'target' in kwargs: + self.target = kwargs.pop('target') + + if 'df_list' in kwargs: + self.df_list = kwargs.pop('df_list') else: raise ValueError("Unsupported initialization") From 03bc7b440ec91604b463d3b4940ec73893ede439 Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Thu, 21 Apr 2022 16:36:36 -0700 Subject: [PATCH 2/6] refactor(eda): ran just ci --- dataprep/clean/clean_country.py | 4 +- .../eda/create_diff_report/diff_formatter.py | 10 ++- dataprep/eda/diff/render.py | 62 ++++++++++++------- dataprep/eda/intermediate.py | 10 +-- 4 files changed, 55 insertions(+), 31 deletions(-) diff --git a/dataprep/clean/clean_country.py b/dataprep/clean/clean_country.py index 3b749558b..c8c6ab0cd 100644 --- a/dataprep/clean/clean_country.py +++ b/dataprep/clean/clean_country.py @@ -271,7 +271,7 @@ def _format_country( return result, 2 if val != result else 3 -@lru_cache(maxsize=2**20) +@lru_cache(maxsize=2 ** 20) def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, clean: bool) -> Any: """ Finds the index of the given country in the DATA dataframe. @@ -322,7 +322,7 @@ def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, c return (None, "unknown") if clean else False -@lru_cache(maxsize=2**20) +@lru_cache(maxsize=2 ** 20) def _check_fuzzy_dist(country: str, fuzzy_dist: int) -> Any: """ A match is found if a country has an edit distance <= fuzzy_dist diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index 7c05559c6..a51248cc0 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -68,7 +68,7 @@ def format_diff_report( cfg: Config, mode: Optional[str], progress: bool = True, - target: Optional[str] = None + target: Optional[str] = None, ) -> Dict[str, Any]: """ Format the data and figures needed by create_diff_report @@ -118,6 +118,7 @@ def format_diff_report( raise ValueError(f"Unknown mode: {mode}") return report + def validate_target(target: str, df_list: List[pd.DataFrame]): """ Helper function, verify that target column exists @@ -128,7 +129,8 @@ def validate_target(target: str, df_list: List[pd.DataFrame]): exists = True break if not exists: - raise ValueError(f'Sorry, {target} is not a valid column') + raise ValueError(f"Sorry, {target} is not a valid column") + def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config) -> Dict[str, Any]: """ @@ -295,7 +297,9 @@ def compute_plot_data( elif is_dtype(dtp, DateTime_v1()): plot_data.append((col, dtp, dask.compute(*datum), orig)) # workaround - return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list) + return Intermediate( + data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list + ) def _compute_variables(df: EDAFrame, cfg: Config) -> Dict[str, Any]: diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index da0923b6a..795d4c85a 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -84,7 +84,7 @@ def bar_viz( df_labels: List[str], baseline: int, target: Optional[str] = None, - df_list: Optional[List[pd.DataFrame]] = None + df_list: Optional[List[pd.DataFrame]] = None, ) -> Figure: """ Render a bar chart @@ -119,7 +119,7 @@ def bar_viz( tools="hover", x_range=list(df[baseline].index), y_axis_type=yscale, - y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)) + y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)), ) row_names = None offset = np.linspace(-0.08 * len(df), 0.08 * len(df), len(df)) if len(df) > 1 else [0] @@ -157,7 +157,7 @@ def bar_viz( if show_yticks and yscale == "linear": _format_axis(fig, 0, df[baseline].max(), "y") - + df1, df2 = df_list[0], df_list[1] if target != col and target and col in df1.columns and col in df2.columns: col1, col2 = df_list[0][col], df_list[1][col] @@ -166,12 +166,23 @@ def bar_viz( for names in row_names: row_avgs_1.append(df_list[0][target][col1 == names].mean()) row_avgs_2.append(df_list[1][target][col2 == names].mean()) - + row_avgs_1 = [0 if math.isnan(x) else x for x in row_avgs_1] row_avgs_2 = [0 if math.isnan(x) else x for x in row_avgs_2] - fig.extra_y_ranges = {"Averages": Range1d(start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc), end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc))} - fig.multi_line([row_names, row_names], [row_avgs_1, row_avgs_2], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4) - fig.add_layout(LinearAxis(y_range_name="Averages"), 'right') + fig.extra_y_ranges = { + "Averages": Range1d( + start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc), + end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc), + ) + } + fig.multi_line( + [row_names, row_names], + [row_avgs_1, row_avgs_2], + color=["navy", "firebrick"], + y_range_name="Averages", + line_width=4, + ) + fig.add_layout(LinearAxis(y_range_name="Averages"), "right") return fig @@ -186,7 +197,7 @@ def hist_viz( df_labels: List[str], orig: Optional[List[str]] = None, target: Optional[str] = None, - df_list: Optional[List[pd.DataFrame]] = None + df_list: Optional[List[pd.DataFrame]] = None, ) -> Figure: """ Render a histogram @@ -222,14 +233,13 @@ def hist_viz( counts_max_2 = max(counts_list[1]) y_start, y_end = min(counts_min_1, counts_min_2), max(counts_max_1, counts_max_2) - fig = Figure( plot_height=plot_height, plot_width=plot_width, title=col, toolbar_location=None, - y_axis_type=yscale + y_axis_type=yscale, ) bins_list = [] for i, hst in enumerate(hist): @@ -252,7 +262,9 @@ def hist_viz( bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2 if y_start is not None and y_end is not None: # fig.y_range = (y_start * (1 - y_inc), y_end * (1 + y_inc)) - fig.extra_y_ranges = {"Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc))} + fig.extra_y_ranges = { + "Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc)) + } fig.quad( source=df, left="left", @@ -262,7 +274,7 @@ def hist_viz( top="freq", fill_color=CATEGORY10[i], line_color=CATEGORY10[i], - y_range_name="Counts" + y_range_name="Counts", ) else: fig.quad( @@ -273,11 +285,11 @@ def hist_viz( alpha=0.5, top="freq", fill_color=CATEGORY10[i], - line_color=CATEGORY10[i] + line_color=CATEGORY10[i], ) # if col == 'LotFrontage': - # breakpoint() - + # breakpoint() + hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline") fig.add_tools(hover) @@ -325,9 +337,17 @@ def hist_viz( max_range = max(df1_bin_averages + df2_bin_averages) min_range = min(df1_bin_averages + df2_bin_averages) - fig.extra_y_ranges['Averages'] = Range1d(start=min_range * (1 - y_inc), end=max_range * (1 + y_inc)) - fig.multi_line([bins_1, bins_2], [df1_bin_averages, df2_bin_averages], color=['navy', 'firebrick'], y_range_name="Averages", line_width=4) - fig.add_layout(LinearAxis(y_range_name="Averages", axis_label='Bin Averages'), 'right') + fig.extra_y_ranges["Averages"] = Range1d( + start=min_range * (1 - y_inc), end=max_range * (1 + y_inc) + ) + fig.multi_line( + [bins_1, bins_2], + [df1_bin_averages, df2_bin_averages], + color=["navy", "firebrick"], + y_range_name="Averages", + line_width=4, + ) + fig.add_layout(LinearAxis(y_range_name="Averages", axis_label="Bin Averages"), "right") return fig @@ -678,7 +698,7 @@ def format_num_stats(data: Dict[str, List[Any]]) -> Dict[str, Dict[str, List[Any descriptive = { "Mean": data["mean"], "Standard Deviation": data["std"], - "Variance": [std**2 for std in data["std"]], + "Variance": [std ** 2 for std in data["std"]], "Sum": [mean * npres for mean, npres in zip(data["mean"], data["npres"])], "Skewness": [float(skew) for skew in data["skew"]], "Kurtosis": [float(kurt) for kurt in data["kurt"]], @@ -734,7 +754,7 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: df_labels, baseline if len(df) > 1 else 0, target, - df_list + df_list, ) elif is_dtype(dtp, Continuous()): if cfg.diff.density: @@ -753,7 +773,7 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: df_labels, orig, target, - df_list + df_list, ) elif is_dtype(dtp, DateTime()): df, timeunit = data diff --git a/dataprep/eda/intermediate.py b/dataprep/eda/intermediate.py index 35da501d1..258ecdb03 100644 --- a/dataprep/eda/intermediate.py +++ b/dataprep/eda/intermediate.py @@ -30,11 +30,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: visual_type = kwargs.pop("visual_type") super().__init__(**kwargs) self.visual_type = visual_type - if 'target' in kwargs: - self.target = kwargs.pop('target') - - if 'df_list' in kwargs: - self.df_list = kwargs.pop('df_list') + if "target" in kwargs: + self.target = kwargs.pop("target") + + if "df_list" in kwargs: + self.df_list = kwargs.pop("df_list") else: raise ValueError("Unsupported initialization") From abe50322edbe43b5cbb331531ea8396c0187412e Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Fri, 22 Apr 2022 13:05:29 -0700 Subject: [PATCH 3/6] docs(eda): added target analysis description to docstrings and eda docs --- dataprep/eda/create_diff_report/__init__.py | 2 ++ .../eda/create_diff_report/diff_formatter.py | 13 ++++++++++ .../user_guide/eda/create_diff_report.ipynb | 26 +++++++++++++++++++ 3 files changed, 41 insertions(+) diff --git a/dataprep/eda/create_diff_report/__init__.py b/dataprep/eda/create_diff_report/__init__.py index 860be8bc2..fde97e912 100644 --- a/dataprep/eda/create_diff_report/__init__.py +++ b/dataprep/eda/create_diff_report/__init__.py @@ -36,6 +36,8 @@ def create_diff_report( ---------- df_list The DataFrames for which data are calculated. + target + Target feature to be compared against all other columns. config A dictionary for configuring the visualizations E.g. config={"hist.bins": 20} diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index a51248cc0..764e8de63 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -84,6 +84,8 @@ def format_diff_report( Currently only the 'basic' is fully implemented. progress Whether to show the progress bar. + target + Target feature to be compared against all other columns. Returns ------- @@ -122,6 +124,13 @@ def format_diff_report( def validate_target(target: str, df_list: List[pd.DataFrame]): """ Helper function, verify that target column exists + + Parameters + ---------- + target + Target feature to be compared against all other columns. + df_list + The Dataframe for which data are calculated. """ exists = False for df in df_list: @@ -140,6 +149,8 @@ def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config ---------- df_list The DataFrames for which data are calculated. + target + Target feature to be compared against all other columns. cfg The config dict user passed in. E.g. config = {"hist.bins": 20} Without user's specifications, the default is "auto" @@ -242,6 +253,8 @@ def compute_plot_data( E.g. dtype = {"a": Continuous, "b": "Nominal"} or dtype = {"a": Continuous(), "b": "nominal"} or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous() + target + Target feature to be compared against all other columns. """ # pylint: disable=too-many-branches, too-many-locals diff --git a/docs/source/user_guide/eda/create_diff_report.ipynb b/docs/source/user_guide/eda/create_diff_report.ipynb index 7682299d2..d3bab593a 100644 --- a/docs/source/user_guide/eda/create_diff_report.ipynb +++ b/docs/source/user_guide/eda/create_diff_report.ipynb @@ -81,6 +81,32 @@ "source": [ "create_diff_report({\"df_train\": df_train, \"df_test\": df_test})" ] + }, + { + "cell_type": "markdown", + "id": "b15f40d6", + "metadata": {}, + "source": [ + "## Target Feature Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "a79fe776", + "metadata": {}, + "source": [ + "Users can also pass in a target column to compare how the feature relates to all other features in both dataframes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d6bed85", + "metadata": {}, + "outputs": [], + "source": [ + "create_diff_report([df_train, df_test], target='LotArea')" + ] } ], "metadata": { From 0b801fa8568e724a69d415dd41c488154bc3fe3b Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Fri, 22 Apr 2022 13:55:07 -0700 Subject: [PATCH 4/6] refactor(eda): validated numerical target dtype, removed comments --- dataprep/eda/create_diff_report/__init__.py | 10 ---------- dataprep/eda/create_diff_report/diff_formatter.py | 2 ++ dataprep/eda/diff/render.py | 8 ++++---- 3 files changed, 6 insertions(+), 14 deletions(-) diff --git a/dataprep/eda/create_diff_report/__init__.py b/dataprep/eda/create_diff_report/__init__.py index fde97e912..7607f7066 100644 --- a/dataprep/eda/create_diff_report/__init__.py +++ b/dataprep/eda/create_diff_report/__init__.py @@ -85,16 +85,6 @@ def create_diff_report( "legend_labels": components["legend_lables"], } - # {% for div in value.plots[1] %} - #
- # {{ div }} - # {% if key in context.components.dfs[1].variables %} - # {{ context.components.dfs[1].variables[key].plots[1][loop.index0] }} - # {% endif %} - #
- - # return context - template_base = ENV_LOADER.get_template("base.html") report = template_base.render(context=context, zip=zip) return Report(report) diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index 764e8de63..0f27455b9 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -282,6 +282,8 @@ def compute_plot_data( col_dtype = col_dtype[0] orig = [src for src, seq in labeled_cols.items() if col in seq] + if col == target and not is_dtype(col_dtype, Continuous_v1()): + raise ValueError("Sorry, target must be a numerical feature.") if is_dtype(col_dtype, Continuous_v1()): data.append((col, Continuous_v1(), diff_cont_calcs(srs.apply("dropna"), cfg), orig)) diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 795d4c85a..66f9ce63e 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -101,6 +101,7 @@ def bar_viz( ("Source", "@orig"), ] + # Used to add y-padding to the graphs col1_min = df[0][col].min() col2_min = df[1][col].min() col1_max = df[0][col].max() @@ -159,6 +160,7 @@ def bar_viz( _format_axis(fig, 0, df[baseline].max(), "y") df1, df2 = df_list[0], df_list[1] + # Feature analysis here if target != col and target and col in df1.columns and col in df2.columns: col1, col2 = df_list[0][col], df_list[1][col] row_avgs_1 = [] @@ -261,7 +263,6 @@ def hist_viz( ) bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2 if y_start is not None and y_end is not None: - # fig.y_range = (y_start * (1 - y_inc), y_end * (1 + y_inc)) fig.extra_y_ranges = { "Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc)) } @@ -287,8 +288,6 @@ def hist_viz( fill_color=CATEGORY10[i], line_color=CATEGORY10[i], ) - # if col == 'LotFrontage': - # breakpoint() hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline") fig.add_tools(hover) @@ -312,6 +311,7 @@ def hist_viz( fig.xaxis.axis_label = x_axis_label fig.xaxis.axis_label_standoff = 0 + # Feature analysis here if target and target != col and col in df1.columns and col in df2.columns: col1, col2 = df1[col], df2[col] source1, source2 = col1, col2 @@ -347,7 +347,7 @@ def hist_viz( y_range_name="Averages", line_width=4, ) - fig.add_layout(LinearAxis(y_range_name="Averages", axis_label="Bin Averages"), "right") + fig.add_layout(LinearAxis(y_range_name="Averages"), "right") return fig From a97859a2e67e5b8db8b04c14562ca460803bbc8d Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Wed, 27 Apr 2022 21:53:44 -0700 Subject: [PATCH 5/6] refactor(eda): removed unused imports --- dataprep/eda/diff/render.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 66f9ce63e..1fc2f06c1 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -3,12 +3,9 @@ """ # pylint: disable=too-many-lines from turtle import color from typing import Any, Dict, List, Tuple, Optional -from sklearn.preprocessing import MinMaxScaler import math import numpy as np import pandas as pd -import dask.array as da -import matplotlib.pyplot as plt from bokeh.models import ( HoverTool, Panel, From 476d7aafbbadf47bb294dc41d44adc77044776e2 Mon Sep 17 00:00:00 2001 From: Devin Lu Date: Mon, 2 May 2022 21:23:43 -0700 Subject: [PATCH 6/6] refactor(eda): reformatted code for ci --- dataprep/clean/clean_country.py | 4 ++-- dataprep/eda/diff/render.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprep/clean/clean_country.py b/dataprep/clean/clean_country.py index c8c6ab0cd..3b749558b 100644 --- a/dataprep/clean/clean_country.py +++ b/dataprep/clean/clean_country.py @@ -271,7 +271,7 @@ def _format_country( return result, 2 if val != result else 3 -@lru_cache(maxsize=2 ** 20) +@lru_cache(maxsize=2**20) def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, clean: bool) -> Any: """ Finds the index of the given country in the DATA dataframe. @@ -322,7 +322,7 @@ def _check_country(country: str, input_formats: Tuple[str, ...], strict: bool, c return (None, "unknown") if clean else False -@lru_cache(maxsize=2 ** 20) +@lru_cache(maxsize=2**20) def _check_fuzzy_dist(country: str, fuzzy_dist: int) -> Any: """ A match is found if a country has an edit distance <= fuzzy_dist diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 1fc2f06c1..6953d9f32 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -695,7 +695,7 @@ def format_num_stats(data: Dict[str, List[Any]]) -> Dict[str, Dict[str, List[Any descriptive = { "Mean": data["mean"], "Standard Deviation": data["std"], - "Variance": [std ** 2 for std in data["std"]], + "Variance": [std**2 for std in data["std"]], "Sum": [mean * npres for mean, npres in zip(data["mean"], data["npres"])], "Skewness": [float(skew) for skew in data["skew"]], "Kurtosis": [float(kurt) for kurt in data["kurt"]],