Skip to content

Commit a160ae6

Browse files
[backport 2.3.x] BUG[string]: incorrect index downcast in DataFrame.join (#61771) (#61800)
* BUG[string]: incorrect index downcast in DataFrame.join (#61771)

  Co-authored-by: Joris Van den Bossche <[email protected]>

* avoid warning

---------

Co-authored-by: jbrockmendel <[email protected]>
1 parent cf5db68 commit a160ae6

File tree

3 files changed: 10 additions (+10) and 16 deletions (-16).

doc/source/whatsnew/v2.3.1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ correctly, rather than defaulting to ``object`` dtype. For example:
5757
Bug fixes
5858
^^^^^^^^^
5959
- Bug in :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.Resampler.min`, :meth:`.Resampler.max` where all NA values of string dtype would return float instead of string dtype (:issue:`60810`)
60+
- Bug in :meth:`DataFrame.join` incorrectly downcasting object-dtype indexes (:issue:`61771`)
6061
- Bug in :meth:`DataFrame.sum` with ``axis=1``, :meth:`.DataFrameGroupBy.sum` or :meth:`.SeriesGroupBy.sum` with ``skipna=True``, and :meth:`.Resampler.sum` with all NA values of :class:`StringDtype` resulted in ``0`` instead of the empty string ``""`` (:issue:`60229`)
6162
- Fixed bug in :meth:`DataFrame.explode` and :meth:`Series.explode` where methods would fail with ``dtype="str"`` (:issue:`61623`)
6263
- Fixed bug in unpickling objects pickled in pandas versions pre-2.3.0 that used :class:`StringDtype` (:issue:`61763`).

pandas/core/reshape/merge.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,13 +1076,13 @@ def _maybe_add_join_keys(
10761076
# if we have an all missing left_indexer
10771077
# make sure to just use the right values or vice-versa
10781078
if left_indexer is not None and (left_indexer == -1).all():
1079-
key_col = Index(rvals)
1079+
key_col = Index(rvals, dtype=rvals.dtype, copy=False)
10801080
result_dtype = rvals.dtype
10811081
elif right_indexer is not None and (right_indexer == -1).all():
1082-
key_col = Index(lvals)
1082+
key_col = Index(lvals, dtype=lvals.dtype, copy=False)
10831083
result_dtype = lvals.dtype
10841084
else:
1085-
key_col = Index(lvals)
1085+
key_col = Index(lvals, dtype=lvals.dtype, copy=False)
10861086
if left_indexer is not None:
10871087
mask_left = left_indexer == -1
10881088
key_col = key_col.where(~mask_left, rvals)
@@ -1112,7 +1112,8 @@ def _maybe_add_join_keys(
11121112

11131113
result.set_index(idx_list, inplace=True)
11141114
else:
1115-
result.index = Index(key_col, name=name)
1115+
key_col.name = name
1116+
result.index = key_col
11161117
else:
11171118
result.insert(i, name or f"key_{i}", key_col)
11181119

pandas/tests/copy_view/test_functions.py

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,6 @@
11
import numpy as np
22
import pytest
33

4-
from pandas._config import using_string_dtype
5-
6-
from pandas.compat import HAS_PYARROW
7-
84
from pandas import (
95
DataFrame,
106
Index,
@@ -317,13 +313,9 @@ def test_merge_copy_keyword(using_copy_on_write, copy):
317313
assert not np.shares_memory(get_array(df2, "b"), get_array(result, "b"))
318314

319315

320-
@pytest.mark.xfail(
321-
using_string_dtype() and HAS_PYARROW,
322-
reason="TODO(infer_string); result.index infers str dtype while both "
323-
"df1 and df2 index are object.",
324-
)
325-
def test_join_on_key(using_copy_on_write):
326-
df_index = Index(["a", "b", "c"], name="key", dtype=object)
316+
@pytest.mark.parametrize("dtype", [object, "str"])
317+
def test_join_on_key(dtype, using_copy_on_write):
318+
df_index = Index(["a", "b", "c"], name="key", dtype=dtype)
327319

328320
df1 = DataFrame({"a": [1, 2, 3]}, index=df_index.copy(deep=True))
329321
df2 = DataFrame({"b": [4, 5, 6]}, index=df_index.copy(deep=True))
@@ -336,7 +328,7 @@ def test_join_on_key(using_copy_on_write):
336328
if using_copy_on_write:
337329
assert np.shares_memory(get_array(result, "a"), get_array(df1, "a"))
338330
assert np.shares_memory(get_array(result, "b"), get_array(df2, "b"))
339-
assert np.shares_memory(get_array(result.index), get_array(df1.index))
331+
assert tm.shares_memory(get_array(result.index), get_array(df1.index))
340332
assert not np.shares_memory(get_array(result.index), get_array(df2.index))
341333
else:
342334
assert not np.shares_memory(get_array(result, "a"), get_array(df1, "a"))

0 commit comments

Comments
 (0)