Skip to content

Commit

Permalink
plot: Avoid non-numeric column aggregation errors
Browse files Browse the repository at this point in the history
With a recent version of pandas, an object column that cannot be
aggregated (e.g. some JSON representations) will raise an error.

We need to use the numeric_only=True counterpart whenever aggregating.
  • Loading branch information
wookayin committed Nov 9, 2023
1 parent d83799c commit 675aa5c
Showing 1 changed file with 13 additions and 11 deletions.
24 changes: 13 additions & 11 deletions expt/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,10 +396,10 @@ def _representative_and_err(h: Hypothesis) -> Tuple[

representative: pd.DataFrame = (
representative_fn(h) if representative_fn \
else cast(pd.DataFrame, h.grouped.mean())
else cast(pd.DataFrame, h.mean(numeric_only=True))
)
err_range: Tuple[pd.DataFrame, pd.DataFrame]
std = err_fn(h) if err_fn else h.grouped.std()
std = err_fn(h) if err_fn else h.std(numeric_only=True)

# Condition check: when representative_fn is given,
# err_fn should return a range (i.e., tuple)
Expand All @@ -412,7 +412,7 @@ def _representative_and_err(h: Hypothesis) -> Tuple[
f"err_fn returned: {std}")

if isinstance(std, pd.DataFrame):
mean = h.grouped.mean()
mean = h.mean(numeric_only=True)
err_range = (mean - std, mean + std)
return representative, err_range

Expand All @@ -438,7 +438,7 @@ def _representative_and_err(h: Hypothesis) -> Tuple[
# might have different x values --- we need to interpolate.
# (i) check if the x-column is consistent?
x = kwargs['x']
if n_samples is None and np.any(self._parent.grouped.nunique()[x] > 1):
if n_samples is None and np.any(self._parent.grouped[x].nunique() > 1):
warnings.warn(
f"The x value (column `{x}`) is not consistent "
"over different runs. Automatically falling back to the "
Expand Down Expand Up @@ -496,6 +496,15 @@ def _should_include_column(col_name: str) -> bool:
if not col_name: # empty name
return False

# include only numeric values (integer or float).
# (check from the original hypothesis dataframe, not from representative)
for df in self._dataframes:
if col_name in df and df[col_name].dtype.kind not in ('i', 'f'):
if not _use_default_y:
raise ValueError(f"Invalid y: the column `{col_name}` "
f"has a non-numeric type: {df[col_name].dtype}.")
return False

# unknown column in the DataFrame
# Note that additional extra_y columns are also accepted
if col_name not in representative.columns:
Expand All @@ -507,13 +516,6 @@ def _should_include_column(col_name: str) -> bool:
f"Available columns: {list(representative.columns)}; " +
"Use ignore_unknown=True to ignore unknown columns.")

# include only numeric values (integer or float)
dtypes: Dict[str, Any] = representative.dtypes.to_dict()
if dtypes[col_name].kind not in ('i', 'f'):
if not _use_default_y:
raise ValueError(f"Invalid y: the column `{col_name}` "
f"has a non-numeric type: {dtypes[col_name]}.")
return False
return True

# Exclude non-numeric types that cannot be plotted or interpolated
Expand Down

0 comments on commit 675aa5c

Please sign in to comment.