Skip to content

Commit

Permalink
Fix merging when index name in meta mismatches actual name (#1119)
Browse files Browse the repository at this point in the history
Co-authored-by: Hendrik Makait <hendrik@makait.com>
  • Loading branch information
phofl and hendrikmakait authored Aug 12, 2024
1 parent 5cee29b commit 0bb77f1
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 2 deletions.
4 changes: 3 additions & 1 deletion dask_expr/_shuffle.py
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,9 @@ def _get_index(idx, obj):
if idx.ndim == 1:
idx = idx.to_frame()
elif index_shuffle:
idx = obj.index.to_frame()
# set default index name, otherwise we will end up with 0
name = {"name": "_index"} if idx == ["_index"] else {}
idx = obj.index.to_frame(**name)
else:
idx = _select_columns_or_index(obj, idx)
return idx
Expand Down
19 changes: 18 additions & 1 deletion dask_expr/tests/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@

import numpy as np
import pytest
from dask import delayed

from dask_expr import Merge, from_pandas, merge, repartition
from dask_expr import Merge, from_delayed, from_pandas, merge, repartition
from dask_expr._expr import Filter, Projection
from dask_expr._merge import BroadcastJoin
from dask_expr._shuffle import Shuffle
Expand Down Expand Up @@ -1031,6 +1032,22 @@ def test_merge_after_rename(index):
assert_eq(result, expected, check_index=False)


def test_mismatching_meta():
    """Check that a join works when meta and partitions disagree on index name.

    The delayed partitions carry an index named ``"index"``, while the
    ``meta`` handed to ``from_delayed`` is built without an explicit index
    and therefore has an unnamed one.
    """
    left = from_pandas(pd.DataFrame({"value": [1, 2, 3]}))

    def creator(i):
        # ``i`` is unused: every partition has the same content.
        named_index = pd.Index([1, 2, 3], name="index")
        return pd.DataFrame({"value2": [1, 2, 3]}, index=named_index)

    partitions = [delayed(creator)(i) for i in range(3)]
    # meta is constructed without the index name the partitions actually use.
    meta = pd.DataFrame({"value2": [1, 2, 3]})
    right = from_delayed(partitions, meta=meta)

    result = left.join(right)
    expected = left.compute().join(right.compute())
    assert_eq(result, expected, check_index=False)


def test_merge_tuple_left_on():
df = pd.DataFrame(
{
Expand Down

0 comments on commit 0bb77f1

Please sign in to comment.