Skip to content

BUG: Fix infer_dtype result for float with embedded pd.NA #61624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -711,6 +711,7 @@ Timezones

Numeric
^^^^^^^
- Bug in :func:`api.types.infer_dtype` returning ``"mixed-integer-float"`` instead of ``"floating"`` for object-dtype data containing floats mixed with ``pd.NA`` (:issue:`61621`)
- Bug in :meth:`DataFrame.corr` where numerical precision errors resulted in correlations above ``1.0`` (:issue:`61120`)
- Bug in :meth:`DataFrame.cov` raises a ``TypeError`` instead of returning potentially incorrect results or other errors (:issue:`53115`)
- Bug in :meth:`DataFrame.quantile` where the column type was not preserved when ``numeric_only=True`` with a list-like ``q`` produced an empty result (:issue:`59035`)
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def is_time_array(values: np.ndarray, skipna: bool = ...): ...
def is_date_array(values: np.ndarray, skipna: bool = ...): ...
def is_datetime_array(values: np.ndarray, skipna: bool = ...): ...
def is_string_array(values: np.ndarray, skipna: bool = ...): ...
def is_float_array(values: np.ndarray): ...
def is_float_array(values: np.ndarray, skipna: bool = ...): ...
def is_integer_array(values: np.ndarray, skipna: bool = ...): ...
def is_bool_array(values: np.ndarray, skipna: bool = ...): ...
def fast_multiget(
Expand Down
8 changes: 5 additions & 3 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1751,7 +1751,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str:
return "complex"

elif util.is_float_object(val):
if is_float_array(values):
if is_float_array(values, skipna=skipna):
return "floating"
elif is_integer_float_array(values, skipna=skipna):
if is_integer_na_array(values, skipna=skipna):
Expand Down Expand Up @@ -1953,9 +1953,11 @@ cdef class FloatValidator(Validator):


# Note: only python-exposed for tests
cpdef bint is_float_array(ndarray values):
cpdef bint is_float_array(ndarray values, bint skipna=True):
cdef:
FloatValidator validator = FloatValidator(values.size, values.dtype)
FloatValidator validator = FloatValidator(values.size,
values.dtype,
skipna=skipna)
return validator.validate(values)


Expand Down
5 changes: 1 addition & 4 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,10 +1086,7 @@ def convert_dtypes(
elif (
infer_objects
and input_array.dtype == object
and (
isinstance(inferred_dtype, str)
and inferred_dtype == "mixed-integer-float"
)
and inferred_dtype == "floating"
):
inferred_dtype = pandas_dtype_func("Float64")

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,15 @@ def test_infer_dtype_period_with_na(self, na_value):
arr = np.array([na_value, Period("2011-01", freq="D"), na_value])
assert lib.infer_dtype(arr, skipna=True) == "period"

@pytest.mark.parametrize("na_value", [pd.NA, np.nan])
def test_infer_dtype_numeric_with_na(self, na_value):
# GH61621
arr = Series([1, 2, na_value], dtype=object)
assert lib.infer_dtype(arr, skipna=True) == "integer"

arr = Series([1.0, 2.0, na_value], dtype=object)
assert lib.infer_dtype(arr, skipna=True) == "floating"

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
assert lib.infer_dtype(arr, skipna=True) == "floating"
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -3090,7 +3090,7 @@ def test_infer_dtype_pyarrow_dtype(data, request):
res = lib.infer_dtype(data)
assert res != "unknown-array"

if data._hasna and res in ["floating", "datetime64", "timedelta64"]:
if data._hasna and res in ["datetime64", "timedelta64"]:
mark = pytest.mark.xfail(
reason="in infer_dtype pd.NA is not ignored in these cases "
"even with skipna=True in the list(data) check below"
Expand Down
Loading