diff --git a/dataprep/eda/create_diff_report/__init__.py b/dataprep/eda/create_diff_report/__init__.py index e48d6f8a5..7607f7066 100644 --- a/dataprep/eda/create_diff_report/__init__.py +++ b/dataprep/eda/create_diff_report/__init__.py @@ -22,6 +22,7 @@ def create_diff_report( df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], + target: Optional[str] = None, config: Optional[Dict[str, Any]] = None, display: Optional[List[str]] = None, title: Optional[str] = "DataPrep Report", @@ -35,6 +36,8 @@ def create_diff_report( ---------- df_list The DataFrames for which data are calculated. + target + Target feature to be compared against all other columns. config A dictionary for configuring the visualizations E.g. config={"hist.bins": 20} @@ -63,7 +66,7 @@ def create_diff_report( _suppress_warnings() cfg = Config.from_dict(display, config) - components = format_diff_report(df_list, cfg, mode, progress) + components = format_diff_report(df_list, cfg, mode, progress, target) dict_stats = defaultdict(list) @@ -82,16 +85,6 @@ def create_diff_report( "legend_labels": components["legend_lables"], } - # {% for div in value.plots[1] %} - #
- # {{ div }} - # {% if key in context.components.dfs[1].variables %} - # {{ context.components.dfs[1].variables[key].plots[1][loop.index0] }} - # {% endif %} - #
- - # return context - template_base = ENV_LOADER.get_template("base.html") report = template_base.render(context=context, zip=zip) return Report(report) diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py index 0dcd7bd67..0f27455b9 100644 --- a/dataprep/eda/create_diff_report/diff_formatter.py +++ b/dataprep/eda/create_diff_report/diff_formatter.py @@ -68,6 +68,7 @@ def format_diff_report( cfg: Config, mode: Optional[str], progress: bool = True, + target: Optional[str] = None, ) -> Dict[str, Any]: """ Format the data and figures needed by create_diff_report @@ -83,6 +84,8 @@ def format_diff_report( Currently only the 'basic' is fully implemented. progress Whether to show the progress bar. + target + Target feature to be compared against all other columns. Returns ------- @@ -110,13 +113,35 @@ def format_diff_report( if mode == "basic": # note: we need the type ignore comment for mypy otherwise it complains because # it doesn't realize that we converted df_list to a list if it's a dictionary - report = format_basic(df_list, cfg) # type: ignore + if target: + validate_target(target, df_list) + report = format_basic(df_list, target, cfg) # type: ignore else: raise ValueError(f"Unknown mode: {mode}") return report -def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: +def validate_target(target: str, df_list: List[pd.DataFrame]): + """ + Helper function, verify that target column exists + + Parameters + ---------- + target + Target feature to be compared against all other columns. + df_list + The Dataframe for which data are calculated. + """ + exists = False + for df in df_list: + if target in df.columns: + exists = True + break + if not exists: + raise ValueError(f"Sorry, {target} is not a valid column") + + +def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config) -> Dict[str, Any]: """ Format basic version. 
@@ -124,6 +149,8 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: ---------- df_list The DataFrames for which data are calculated. + target + Target feature to be compared against all other columns. cfg The config dict user passed in. E.g. config = {"hist.bins": 20} Without user's specifications, the default is "auto" @@ -158,7 +185,7 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]: # data = dask.compute(data) delayed_results.append(data) - res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list) + res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list, target=target) dask_results["df_computations"] = delayed_results dask_results["plots"] = res_plots @@ -211,7 +238,7 @@ def basic_computations(df: EDAFrame, cfg: Config) -> Dict[str, Any]: def compute_plot_data( - df_list: List[dd.DataFrame], cfg: Config, dtype: Optional[DTypeDef] + pd_list: List[pd.DataFrame], cfg: Config, dtype: Optional[DTypeDef], target: Optional[str] ) -> Intermediate: """ Compute function for create_diff_report's plots @@ -226,9 +253,15 @@ def compute_plot_data( E.g. dtype = {"a": Continuous, "b": "Nominal"} or dtype = {"a": Continuous(), "b": "nominal"} or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous() + target + Target feature to be compared against all other columns. 
""" # pylint: disable=too-many-branches, too-many-locals + df_list = list(map(to_dask, pd_list)) + for i, _ in enumerate(df_list): + df_list[i].columns = df_list[i].columns.astype(str) + dfs = Dfs(df_list) dfs_cols = dfs.columns.apply("to_list").data @@ -249,6 +282,8 @@ def compute_plot_data( col_dtype = col_dtype[0] orig = [src for src, seq in labeled_cols.items() if col in seq] + if col == target and not is_dtype(col_dtype, Continuous_v1()): + raise ValueError("Sorry, target must be a numerical feature.") if is_dtype(col_dtype, Continuous_v1()): data.append((col, Continuous_v1(), diff_cont_calcs(srs.apply("dropna"), cfg), orig)) @@ -277,7 +312,9 @@ def compute_plot_data( elif is_dtype(dtp, DateTime_v1()): plot_data.append((col, dtp, dask.compute(*datum), orig)) # workaround - return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid") + return Intermediate( + data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list + ) def _compute_variables(df: EDAFrame, cfg: Config) -> Dict[str, Any]: @@ -407,14 +444,11 @@ def _format_variables(df: EDAFrame, cfg: Config, data: Dict[str, Any]) -> Dict[s def _format_plots( - df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config + df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config, target: Optional[str] ) -> Dict[str, Any]: """Formatting of plots section""" - df_list = list(map(to_dask, df_list)) - for i, _ in enumerate(df_list): - df_list[i].columns = df_list[i].columns.astype(str) - itmdt = compute_plot_data(df_list=df_list, cfg=cfg, dtype=None) + itmdt = compute_plot_data(pd_list=df_list, cfg=cfg, dtype=None, target=target) return render_diff(itmdt, cfg=cfg) diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py index 7a49bd679..6953d9f32 100644 --- a/dataprep/eda/diff/render.py +++ b/dataprep/eda/diff/render.py @@ -1,8 +1,8 @@ """ This module implements the visualization for the plot_diff function. 
""" # pylint: disable=too-many-lines +from turtle import color from typing import Any, Dict, List, Tuple, Optional - import math import numpy as np import pandas as pd @@ -11,9 +11,11 @@ Panel, FactorRange, ) -from bokeh.plotting import Figure, figure +from bokeh.plotting import Figure, figure, show from bokeh.transform import dodge from bokeh.layouts import row +from bokeh.models.ranges import Range1d +from bokeh.models import LinearAxis from ..configs import Config from ..dtypes import Continuous, DateTime, Nominal, is_dtype @@ -78,6 +80,8 @@ def bar_viz( orig: List[str], df_labels: List[str], baseline: int, + target: Optional[str] = None, + df_list: Optional[List[pd.DataFrame]] = None, ) -> Figure: """ Render a bar chart @@ -94,6 +98,13 @@ def bar_viz( ("Source", "@orig"), ] + # Used to add y-padding to the graphs + col1_min = df[0][col].min() + col2_min = df[1][col].min() + col1_max = df[0][col].max() + col2_max = df[1][col].max() + y_inc = 0.05 + if show_yticks: if len(df[baseline]) > 10: plot_width = 28 * len(df[baseline]) @@ -106,12 +117,15 @@ def bar_viz( tools="hover", x_range=list(df[baseline].index), y_axis_type=yscale, + y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)), ) - + row_names = None offset = np.linspace(-0.08 * len(df), 0.08 * len(df), len(df)) if len(df) > 1 else [0] for i, (nrow, data) in enumerate(zip(nrows, df)): data["pct"] = data[col] / nrow * 100 data.index = [str(val) for val in data.index] + if row_names is None: + row_names = data.index data["orig"] = orig[i] fig.vbar( @@ -126,7 +140,6 @@ def bar_viz( tweak_figure(fig, "bar", show_yticks) fig.yaxis.axis_label = "Count" - x_axis_label = "" if ttl_grps > len(df[baseline]): x_axis_label += f"Top {len(df[baseline])} of {ttl_grps} {col}" @@ -142,6 +155,33 @@ def bar_viz( if show_yticks and yscale == "linear": _format_axis(fig, 0, df[baseline].max(), "y") + + df1, df2 = df_list[0], df_list[1] + # Feature analysis here + if target != col and target and 
col in df1.columns and col in df2.columns: + col1, col2 = df_list[0][col], df_list[1][col] + row_avgs_1 = [] + row_avgs_2 = [] + for names in row_names: + row_avgs_1.append(df_list[0][target][col1 == names].mean()) + row_avgs_2.append(df_list[1][target][col2 == names].mean()) + + row_avgs_1 = [0 if math.isnan(x) else x for x in row_avgs_1] + row_avgs_2 = [0 if math.isnan(x) else x for x in row_avgs_2] + fig.extra_y_ranges = { + "Averages": Range1d( + start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc), + end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc), + ) + } + fig.multi_line( + [row_names, row_names], + [row_avgs_1, row_avgs_2], + color=["navy", "firebrick"], + y_range_name="Averages", + line_width=4, + ) + fig.add_layout(LinearAxis(y_range_name="Averages"), "right") return fig @@ -155,18 +195,44 @@ def hist_viz( show_yticks: bool, df_labels: List[str], orig: Optional[List[str]] = None, + target: Optional[str] = None, + df_list: Optional[List[pd.DataFrame]] = None, ) -> Figure: """ Render a histogram """ # pylint: disable=too-many-arguments,too-many-locals - tooltips = [ ("Bin", "@intvl"), ("Frequency", "@freq"), ("Percent", "@pct{0.2f}%"), ("Source", "@orig"), ] + df1, df2 = df_list[0], df_list[1] + y_inc = 0.05 + tooltips = [ + ("Bin", "@intvl"), + ("Frequency", "@freq"), + ("Percent", "@pct{0.2f}%"), + ("Source", "@orig"), + ] + fig = None + + y_start, y_end = None, None + counts_list = [] + if target and target != col and col in df1.columns and col in df2.columns: + for hst in hist: + counts, bins = hst + counts_list.append(counts) + + counts_min_1 = min(counts_list[0]) + counts_min_2 = min(counts_list[1]) + + counts_max_1 = max(counts_list[0]) + counts_max_2 = max(counts_list[1]) + + y_start, y_end = min(counts_min_1, counts_min_2), max(counts_max_1, counts_max_2) + fig = Figure( plot_height=plot_height, plot_width=plot_width, @@ -174,9 +240,10 @@ def hist_viz( toolbar_location=None, y_axis_type=yscale, ) - + bins_list = [] for i, hst in enumerate(hist): counts, 
bins = hst + bins_list.append(bins) if sum(counts) == 0: fig.rect(x=0, y=0, width=0, height=0) continue @@ -192,16 +259,33 @@ def hist_viz( } ) bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2 - fig.quad( - source=df, - left="left", - right="right", - bottom=bottom, - alpha=0.5, - top="freq", - fill_color=CATEGORY10[i], - line_color=CATEGORY10[i], - ) + if y_start is not None and y_end is not None: + fig.extra_y_ranges = { + "Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc)) + } + fig.quad( + source=df, + left="left", + right="right", + bottom=bottom, + alpha=0.5, + top="freq", + fill_color=CATEGORY10[i], + line_color=CATEGORY10[i], + y_range_name="Counts", + ) + else: + fig.quad( + source=df, + left="left", + right="right", + bottom=bottom, + alpha=0.5, + top="freq", + fill_color=CATEGORY10[i], + line_color=CATEGORY10[i], + ) + hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline") fig.add_tools(hover) @@ -224,6 +308,43 @@ def hist_viz( fig.xaxis.axis_label = x_axis_label fig.xaxis.axis_label_standoff = 0 + # Feature analysis here + if target and target != col and col in df1.columns and col in df2.columns: + col1, col2 = df1[col], df2[col] + source1, source2 = col1, col2 + col1 = col1[~np.isnan(col1)] + col2 = col2[~np.isnan(col2)] + num_bins1 = len(bins_list[0]) - 1 + num_bins2 = len(bins_list[1]) - 1 + bins_1, bins_2 = bins_list[0], bins_list[1] + + df1_source_bins_series = pd.cut(source1, bins=bins_1, labels=False) + df1_bin_averages = [None] * num_bins1 + + df2_source_bins_series = pd.cut(source2, bins=bins_2, labels=False) + df2_bin_averages = [None] * num_bins2 + + for b in range(num_bins1): + df1_bin_averages[b] = df1[target][df1_source_bins_series == b].mean() + for b in range(num_bins2): + df2_bin_averages[b] = df2[target][df2_source_bins_series == b].mean() + + df1_bin_averages = [0 if math.isnan(x) else x for x in df1_bin_averages] + df2_bin_averages = [0 if math.isnan(x) else x for x in 
df2_bin_averages] + max_range = max(df1_bin_averages + df2_bin_averages) + min_range = min(df1_bin_averages + df2_bin_averages) + + fig.extra_y_ranges["Averages"] = Range1d( + start=min_range * (1 - y_inc), end=max_range * (1 + y_inc) + ) + fig.multi_line( + [bins_1, bins_2], + [df1_bin_averages, df2_bin_averages], + color=["navy", "firebrick"], + y_range_name="Averages", + line_width=4, + ) + fig.add_layout(LinearAxis(y_range_name="Averages"), "right") return fig @@ -610,6 +731,9 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: nrows = itmdt["stats"]["nrows"] titles: List[str] = [] + df_list = itmdt.df_list + target = itmdt.target + for col, dtp, data, orig in itmdt["data"]: fig = None if is_dtype(dtp, Nominal()): @@ -626,6 +750,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: orig, df_labels, baseline if len(df) > 1 else 0, + target, + df_list, ) elif is_dtype(dtp, Continuous()): if cfg.diff.density: @@ -643,6 +769,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: False, df_labels, orig, + target, + df_list, ) elif is_dtype(dtp, DateTime()): df, timeunit = data @@ -760,7 +888,6 @@ def render_diff(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]: cfg Config instance """ - if itmdt.visual_type == "comparison_grid": visual_elem = render_comparison_grid(itmdt, cfg) if itmdt.visual_type == "comparison_continuous": diff --git a/dataprep/eda/intermediate.py b/dataprep/eda/intermediate.py index 331e4fb0b..258ecdb03 100644 --- a/dataprep/eda/intermediate.py +++ b/dataprep/eda/intermediate.py @@ -30,6 +30,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: visual_type = kwargs.pop("visual_type") super().__init__(**kwargs) self.visual_type = visual_type + if "target" in kwargs: + self.target = kwargs.pop("target") + + if "df_list" in kwargs: + self.df_list = kwargs.pop("df_list") else: raise ValueError("Unsupported initialization") diff --git 
a/docs/source/user_guide/eda/create_diff_report.ipynb b/docs/source/user_guide/eda/create_diff_report.ipynb index 7682299d2..d3bab593a 100644 --- a/docs/source/user_guide/eda/create_diff_report.ipynb +++ b/docs/source/user_guide/eda/create_diff_report.ipynb @@ -81,6 +81,32 @@ "source": [ "create_diff_report({\"df_train\": df_train, \"df_test\": df_test})" ] + }, + { + "cell_type": "markdown", + "id": "b15f40d6", + "metadata": {}, + "source": [ + "## Target Feature Analysis" + ] + }, + { + "cell_type": "markdown", + "id": "a79fe776", + "metadata": {}, + "source": [ + "Users can also pass in a numerical target column to compare how that target relates to every other feature in both DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8d6bed85", + "metadata": {}, + "outputs": [], + "source": [ + "create_diff_report([df_train, df_test], target='LotArea')" + ] } ], "metadata": {