Skip to content

Commit

Permalink
FIX-modin-project#1851: Squash multiple LogicalProject nodes
Browse files Browse the repository at this point in the history
In case of multiple sequential LogicalProject nodes, Calcite
squashes them, but gets the column names from the first one.
To fix the issue, squash on the Modin side and keep the
names from the last one.

Signed-off-by: Andrey Pavlenko <andrey.a.pavlenko@gmail.com>
  • Loading branch information
AndreyPavlenko committed Jul 4, 2023
1 parent b532793 commit dc777a8
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,18 @@ def _push(self, node):
node : CalciteBaseNode
A node to add.
"""
if (
len(self.res) != 0
and isinstance(node, CalciteProjectionNode)
and isinstance(self.res[-1], CalciteProjectionNode)
and all(isinstance(expr, CalciteInputRefExpr) for expr in node.exprs)
):
# Replace the last CalciteProjectionNode with this one and
# translate the input refs.
exprs = self.res.pop().exprs
node = CalciteProjectionNode(
node.fields, [exprs[expr.input] for expr in node.exprs]
)
self.res.append(node)

def _last(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2041,9 +2041,7 @@ def _materialize(self):
new_table, unsupported_cols=[], encode_col_names=False
)[0]
else:
partitions = self._partition_mgr_cls.run_exec_plan(
self._op, self._table_cols
)
partitions = self._partition_mgr_cls.run_exec_plan(self._op)

self._partitions = partitions
self._op = FrameNode(self)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -429,12 +429,12 @@ def to_empty_pandas_df(df):
index_cols = None
else:
index_cols = ColNameCodec.mangle_index_names(merged.index.names)
for orig_name, mangled_name in zip(merged.index.names, index_cols):
for name in index_cols:
# Using _dtypes here since it contains all column names,
# including the index.
df = left if mangled_name in left._dtypes else right
exprs[orig_name] = df.ref(mangled_name)
new_dtypes.append(df._dtypes[mangled_name])
df = left if name in left._dtypes else right
exprs[name] = df.ref(name)
new_dtypes.append(df._dtypes[name])

left_col_names = set(left.columns)
right_col_names = set(right.columns)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,16 +227,14 @@ def is_supported_dtype(dtype):
)

@classmethod
def run_exec_plan(cls, plan, columns):
def run_exec_plan(cls, plan):
"""
Run execution plan in HDK storage format to materialize frame.
Parameters
----------
plan : DFAlgNode
A root of an execution plan tree.
columns : list of str
A frame column names.
Returns
-------
Expand All @@ -256,10 +254,6 @@ def run_exec_plan(cls, plan, columns):
calcite_json = "execute calcite " + calcite_json
table = worker.executeRA(calcite_json)

# workaround for https://github.com/modin-project/modin/issues/1851
if DoUseCalcite.get():
table._column_names = [ColNameCodec.encode(c) for c in columns]

res = np.empty((1, 1), dtype=np.dtype(object))
res[0][0] = cls._partition_class(table)

Expand Down

0 comments on commit dc777a8

Please sign in to comment.