diff --git a/dataprep/eda/create_diff_report/__init__.py b/dataprep/eda/create_diff_report/__init__.py
index e48d6f8a5..7607f7066 100644
--- a/dataprep/eda/create_diff_report/__init__.py
+++ b/dataprep/eda/create_diff_report/__init__.py
@@ -22,6 +22,7 @@
def create_diff_report(
df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]],
+ target: Optional[str] = None,
config: Optional[Dict[str, Any]] = None,
display: Optional[List[str]] = None,
title: Optional[str] = "DataPrep Report",
@@ -35,6 +36,8 @@ def create_diff_report(
----------
df_list
The DataFrames for which data are calculated.
+ target
+ Target feature to be compared against all other columns.
config
A dictionary for configuring the visualizations
E.g. config={"hist.bins": 20}
@@ -63,7 +66,7 @@ def create_diff_report(
_suppress_warnings()
cfg = Config.from_dict(display, config)
- components = format_diff_report(df_list, cfg, mode, progress)
+ components = format_diff_report(df_list, cfg, mode, progress, target)
dict_stats = defaultdict(list)
@@ -82,16 +85,6 @@ def create_diff_report(
"legend_labels": components["legend_lables"],
}
- # {% for div in value.plots[1] %}
- #
- # {{ div }}
- # {% if key in context.components.dfs[1].variables %}
- # {{ context.components.dfs[1].variables[key].plots[1][loop.index0] }}
- # {% endif %}
- #
-
- # return context
-
template_base = ENV_LOADER.get_template("base.html")
report = template_base.render(context=context, zip=zip)
return Report(report)
diff --git a/dataprep/eda/create_diff_report/diff_formatter.py b/dataprep/eda/create_diff_report/diff_formatter.py
index 0dcd7bd67..0f27455b9 100644
--- a/dataprep/eda/create_diff_report/diff_formatter.py
+++ b/dataprep/eda/create_diff_report/diff_formatter.py
@@ -68,6 +68,7 @@ def format_diff_report(
cfg: Config,
mode: Optional[str],
progress: bool = True,
+ target: Optional[str] = None,
) -> Dict[str, Any]:
"""
Format the data and figures needed by create_diff_report
@@ -83,6 +84,8 @@ def format_diff_report(
Currently only the 'basic' is fully implemented.
progress
Whether to show the progress bar.
+ target
+ Target feature to be compared against all other columns.
Returns
-------
@@ -110,13 +113,35 @@ def format_diff_report(
if mode == "basic":
# note: we need the type ignore comment for mypy otherwise it complains because
# it doesn't realize that we converted df_list to a list if it's a dictionary
- report = format_basic(df_list, cfg) # type: ignore
+ if target:
+ validate_target(target, df_list)
+ report = format_basic(df_list, target, cfg) # type: ignore
else:
raise ValueError(f"Unknown mode: {mode}")
return report
-def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]:
+def validate_target(target: str, df_list: List[pd.DataFrame]) -> None:
+    """
+    Verify that the target column exists in every DataFrame being compared.
+
+    Parameters
+    ----------
+    target
+        Target feature to be compared against all other columns.
+    df_list
+        The DataFrames for which data are calculated.
+
+    Raises
+    ------
+    ValueError
+        If df_list is empty or any DataFrame lacks the target column.
+    """
+    # The plot renderers index the target column in each DataFrame, so a column
+    # found in only some of them must be rejected here with a clear error.
+    if not df_list or any(target not in df.columns for df in df_list):
+        raise ValueError(f"Sorry, {target} is not a valid column")
+
+
+def format_basic(df_list: List[pd.DataFrame], target: Optional[str], cfg: Config) -> Dict[str, Any]:
"""
Format basic version.
@@ -124,6 +149,8 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]:
----------
df_list
The DataFrames for which data are calculated.
+ target
+ Target feature to be compared against all other columns.
cfg
The config dict user passed in. E.g. config = {"hist.bins": 20}
Without user's specifications, the default is "auto"
@@ -158,7 +185,7 @@ def format_basic(df_list: List[pd.DataFrame], cfg: Config) -> Dict[str, Any]:
# data = dask.compute(data)
delayed_results.append(data)
- res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list)
+ res_plots = dask.delayed(_format_plots)(cfg=cfg, df_list=df_list, target=target)
dask_results["df_computations"] = delayed_results
dask_results["plots"] = res_plots
@@ -211,7 +238,7 @@ def basic_computations(df: EDAFrame, cfg: Config) -> Dict[str, Any]:
def compute_plot_data(
- df_list: List[dd.DataFrame], cfg: Config, dtype: Optional[DTypeDef]
+ pd_list: List[pd.DataFrame], cfg: Config, dtype: Optional[DTypeDef], target: Optional[str]
) -> Intermediate:
"""
Compute function for create_diff_report's plots
@@ -226,9 +253,15 @@ def compute_plot_data(
E.g. dtype = {"a": Continuous, "b": "Nominal"} or
dtype = {"a": Continuous(), "b": "nominal"}
or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()
+ target
+ Target feature to be compared against all other columns.
"""
# pylint: disable=too-many-branches, too-many-locals
+ df_list = list(map(to_dask, pd_list))
+ for i, _ in enumerate(df_list):
+ df_list[i].columns = df_list[i].columns.astype(str)
+
dfs = Dfs(df_list)
dfs_cols = dfs.columns.apply("to_list").data
@@ -249,6 +282,8 @@ def compute_plot_data(
col_dtype = col_dtype[0]
orig = [src for src, seq in labeled_cols.items() if col in seq]
+ if col == target and not is_dtype(col_dtype, Continuous_v1()):
+ raise ValueError("Sorry, target must be a numerical feature.")
if is_dtype(col_dtype, Continuous_v1()):
data.append((col, Continuous_v1(), diff_cont_calcs(srs.apply("dropna"), cfg), orig))
@@ -277,7 +312,9 @@ def compute_plot_data(
elif is_dtype(dtp, DateTime_v1()):
plot_data.append((col, dtp, dask.compute(*datum), orig)) # workaround
- return Intermediate(data=plot_data, stats=stats, visual_type="comparison_grid")
+ return Intermediate(
+ data=plot_data, stats=stats, visual_type="comparison_grid", target=target, df_list=pd_list
+ )
def _compute_variables(df: EDAFrame, cfg: Config) -> Dict[str, Any]:
@@ -407,14 +444,11 @@ def _format_variables(df: EDAFrame, cfg: Config, data: Dict[str, Any]) -> Dict[s
def _format_plots(
- df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config
+ df_list: Union[List[pd.DataFrame], Dict[str, pd.DataFrame]], cfg: Config, target: Optional[str]
) -> Dict[str, Any]:
"""Formatting of plots section"""
- df_list = list(map(to_dask, df_list))
- for i, _ in enumerate(df_list):
- df_list[i].columns = df_list[i].columns.astype(str)
- itmdt = compute_plot_data(df_list=df_list, cfg=cfg, dtype=None)
+ itmdt = compute_plot_data(pd_list=df_list, cfg=cfg, dtype=None, target=target)
return render_diff(itmdt, cfg=cfg)
diff --git a/dataprep/eda/diff/render.py b/dataprep/eda/diff/render.py
index 7a49bd679..6953d9f32 100644
--- a/dataprep/eda/diff/render.py
+++ b/dataprep/eda/diff/render.py
@@ -1,8 +1,8 @@
"""
This module implements the visualization for the plot_diff function.
""" # pylint: disable=too-many-lines
+from turtle import color
from typing import Any, Dict, List, Tuple, Optional
-
import math
import numpy as np
import pandas as pd
@@ -11,9 +11,11 @@
Panel,
FactorRange,
)
-from bokeh.plotting import Figure, figure
+from bokeh.plotting import Figure, figure, show
from bokeh.transform import dodge
from bokeh.layouts import row
+from bokeh.models.ranges import Range1d
+from bokeh.models import LinearAxis
from ..configs import Config
from ..dtypes import Continuous, DateTime, Nominal, is_dtype
@@ -78,6 +80,8 @@ def bar_viz(
orig: List[str],
df_labels: List[str],
baseline: int,
+ target: Optional[str] = None,
+ df_list: Optional[List[pd.DataFrame]] = None,
) -> Figure:
"""
Render a bar chart
@@ -94,6 +98,13 @@ def bar_viz(
("Source", "@orig"),
]
+ # Used to add y-padding to the graphs
+ col1_min = df[0][col].min()
+ col2_min = df[1][col].min()
+ col1_max = df[0][col].max()
+ col2_max = df[1][col].max()
+ y_inc = 0.05
+
if show_yticks:
if len(df[baseline]) > 10:
plot_width = 28 * len(df[baseline])
@@ -106,12 +117,15 @@ def bar_viz(
tools="hover",
x_range=list(df[baseline].index),
y_axis_type=yscale,
+ y_range=(min(col1_min, col2_min) * (1 - y_inc), max(col1_max, col2_max) * (1 + y_inc)),
)
-
+ row_names = None
offset = np.linspace(-0.08 * len(df), 0.08 * len(df), len(df)) if len(df) > 1 else [0]
for i, (nrow, data) in enumerate(zip(nrows, df)):
data["pct"] = data[col] / nrow * 100
data.index = [str(val) for val in data.index]
+ if row_names is None:
+ row_names = data.index
data["orig"] = orig[i]
fig.vbar(
@@ -126,7 +140,6 @@ def bar_viz(
tweak_figure(fig, "bar", show_yticks)
fig.yaxis.axis_label = "Count"
-
x_axis_label = ""
if ttl_grps > len(df[baseline]):
x_axis_label += f"Top {len(df[baseline])} of {ttl_grps} {col}"
@@ -142,6 +155,33 @@ def bar_viz(
if show_yticks and yscale == "linear":
_format_axis(fig, 0, df[baseline].max(), "y")
+
+ df1, df2 = df_list[0], df_list[1]
+ # Feature analysis here
+ if target != col and target and col in df1.columns and col in df2.columns:
+ col1, col2 = df_list[0][col], df_list[1][col]
+ row_avgs_1 = []
+ row_avgs_2 = []
+ for names in row_names:
+ row_avgs_1.append(df_list[0][target][col1 == names].mean())
+ row_avgs_2.append(df_list[1][target][col2 == names].mean())
+
+ row_avgs_1 = [0 if math.isnan(x) else x for x in row_avgs_1]
+ row_avgs_2 = [0 if math.isnan(x) else x for x in row_avgs_2]
+ fig.extra_y_ranges = {
+ "Averages": Range1d(
+ start=min(row_avgs_1 + row_avgs_2) * (1 - y_inc),
+ end=max(row_avgs_1 + row_avgs_2) * (1 + y_inc),
+ )
+ }
+ fig.multi_line(
+ [row_names, row_names],
+ [row_avgs_1, row_avgs_2],
+ color=["navy", "firebrick"],
+ y_range_name="Averages",
+ line_width=4,
+ )
+ fig.add_layout(LinearAxis(y_range_name="Averages"), "right")
return fig
@@ -155,18 +195,44 @@ def hist_viz(
show_yticks: bool,
df_labels: List[str],
orig: Optional[List[str]] = None,
+ target: Optional[str] = None,
+ df_list: Optional[List[pd.DataFrame]] = None,
) -> Figure:
"""
Render a histogram
"""
# pylint: disable=too-many-arguments,too-many-locals
-
tooltips = [
("Bin", "@intvl"),
("Frequency", "@freq"),
("Percent", "@pct{0.2f}%"),
("Source", "@orig"),
]
+ df1, df2 = df_list[0], df_list[1]
+ y_inc = 0.05
+ tooltips = [
+ ("Bin", "@intvl"),
+ ("Frequency", "@freq"),
+ ("Percent", "@pct{0.2f}%"),
+ ("Source", "@orig"),
+ ]
+ fig = None
+
+ y_start, y_end = None, None
+ counts_list = []
+ if target and target != col and col in df1.columns and col in df2.columns:
+ for hst in hist:
+ counts, bins = hst
+ counts_list.append(counts)
+
+ counts_min_1 = min(counts_list[0])
+ counts_min_2 = min(counts_list[1])
+
+ counts_max_1 = max(counts_list[0])
+ counts_max_2 = max(counts_list[1])
+
+ y_start, y_end = min(counts_min_1, counts_min_2), max(counts_max_1, counts_max_2)
+
fig = Figure(
plot_height=plot_height,
plot_width=plot_width,
@@ -174,9 +240,10 @@ def hist_viz(
toolbar_location=None,
y_axis_type=yscale,
)
-
+ bins_list = []
for i, hst in enumerate(hist):
counts, bins = hst
+ bins_list.append(bins)
if sum(counts) == 0:
fig.rect(x=0, y=0, width=0, height=0)
continue
@@ -192,16 +259,33 @@ def hist_viz(
}
)
bottom = 0 if yscale == "linear" or df.empty else counts.min() / 2
- fig.quad(
- source=df,
- left="left",
- right="right",
- bottom=bottom,
- alpha=0.5,
- top="freq",
- fill_color=CATEGORY10[i],
- line_color=CATEGORY10[i],
- )
+ if y_start is not None and y_end is not None:
+ fig.extra_y_ranges = {
+ "Counts": Range1d(start=y_start * (1 - y_inc), end=y_end * (1 + y_inc))
+ }
+ fig.quad(
+ source=df,
+ left="left",
+ right="right",
+ bottom=bottom,
+ alpha=0.5,
+ top="freq",
+ fill_color=CATEGORY10[i],
+ line_color=CATEGORY10[i],
+ y_range_name="Counts",
+ )
+ else:
+ fig.quad(
+ source=df,
+ left="left",
+ right="right",
+ bottom=bottom,
+ alpha=0.5,
+ top="freq",
+ fill_color=CATEGORY10[i],
+ line_color=CATEGORY10[i],
+ )
+
hover = HoverTool(tooltips=tooltips, attachment="vertical", mode="vline")
fig.add_tools(hover)
@@ -224,6 +308,43 @@ def hist_viz(
fig.xaxis.axis_label = x_axis_label
fig.xaxis.axis_label_standoff = 0
+ # Feature analysis here
+ if target and target != col and col in df1.columns and col in df2.columns:
+ col1, col2 = df1[col], df2[col]
+ source1, source2 = col1, col2
+ col1 = col1[~np.isnan(col1)]
+ col2 = col2[~np.isnan(col2)]
+ num_bins1 = len(bins_list[0]) - 1
+ num_bins2 = len(bins_list[1]) - 1
+ bins_1, bins_2 = bins_list[0], bins_list[1]
+
+ df1_source_bins_series = pd.cut(source1, bins=bins_1, labels=False)
+ df1_bin_averages = [None] * num_bins1
+
+ df2_source_bins_series = pd.cut(source2, bins=bins_2, labels=False)
+ df2_bin_averages = [None] * num_bins2
+
+ for b in range(num_bins1):
+ df1_bin_averages[b] = df1[target][df1_source_bins_series == b].mean()
+ for b in range(num_bins2):
+ df2_bin_averages[b] = df2[target][df2_source_bins_series == b].mean()
+
+ df1_bin_averages = [0 if math.isnan(x) else x for x in df1_bin_averages]
+ df2_bin_averages = [0 if math.isnan(x) else x for x in df2_bin_averages]
+ max_range = max(df1_bin_averages + df2_bin_averages)
+ min_range = min(df1_bin_averages + df2_bin_averages)
+
+ fig.extra_y_ranges["Averages"] = Range1d(
+ start=min_range * (1 - y_inc), end=max_range * (1 + y_inc)
+ )
+ fig.multi_line(
+ [bins_1, bins_2],
+ [df1_bin_averages, df2_bin_averages],
+ color=["navy", "firebrick"],
+ y_range_name="Averages",
+ line_width=4,
+ )
+ fig.add_layout(LinearAxis(y_range_name="Averages"), "right")
return fig
@@ -610,6 +731,9 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
nrows = itmdt["stats"]["nrows"]
titles: List[str] = []
+ df_list = itmdt.df_list
+ target = itmdt.target
+
for col, dtp, data, orig in itmdt["data"]:
fig = None
if is_dtype(dtp, Nominal()):
@@ -626,6 +750,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
orig,
df_labels,
baseline if len(df) > 1 else 0,
+ target,
+ df_list,
)
elif is_dtype(dtp, Continuous()):
if cfg.diff.density:
@@ -643,6 +769,8 @@ def render_comparison_grid(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
False,
df_labels,
orig,
+ target,
+ df_list,
)
elif is_dtype(dtp, DateTime()):
df, timeunit = data
@@ -760,7 +888,6 @@ def render_diff(itmdt: Intermediate, cfg: Config) -> Dict[str, Any]:
cfg
Config instance
"""
-
if itmdt.visual_type == "comparison_grid":
visual_elem = render_comparison_grid(itmdt, cfg)
if itmdt.visual_type == "comparison_continuous":
diff --git a/dataprep/eda/intermediate.py b/dataprep/eda/intermediate.py
index 331e4fb0b..258ecdb03 100644
--- a/dataprep/eda/intermediate.py
+++ b/dataprep/eda/intermediate.py
@@ -30,6 +30,11 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
visual_type = kwargs.pop("visual_type")
super().__init__(**kwargs)
self.visual_type = visual_type
+ if "target" in kwargs:
+ self.target = kwargs.pop("target")
+
+ if "df_list" in kwargs:
+ self.df_list = kwargs.pop("df_list")
else:
raise ValueError("Unsupported initialization")
diff --git a/docs/source/user_guide/eda/create_diff_report.ipynb b/docs/source/user_guide/eda/create_diff_report.ipynb
index 7682299d2..d3bab593a 100644
--- a/docs/source/user_guide/eda/create_diff_report.ipynb
+++ b/docs/source/user_guide/eda/create_diff_report.ipynb
@@ -81,6 +81,32 @@
"source": [
"create_diff_report({\"df_train\": df_train, \"df_test\": df_test})"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b15f40d6",
+ "metadata": {},
+ "source": [
+ "## Target Feature Analysis"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a79fe776",
+ "metadata": {},
+ "source": [
+ "Users can also pass the name of a numerical column as `target` to see how that column's average value varies across every other feature in both DataFrames."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d6bed85",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_diff_report([df_train, df_test], target='LotArea')"
+ ]
}
],
"metadata": {