6 changes: 6 additions & 0 deletions bigframes/core/compile/aggregate_compiler.py
@@ -350,6 +350,12 @@ def _(
     return _apply_window_if_present(column.count(), window)
 
 
+@compile_unary_agg.register
+def _(op: agg_ops.ReverseRowNumberOp, column: ibis_types.NumericColumn, window=None):
+    row_count = _apply_window_if_present(ibis_ops.count(1), window)
+    return column - row_count
+
+
 @compile_unary_agg.register
 def _(
     op: agg_ops.CutOp,
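The new compile rule pairs a windowed count with the offsets column: subtracting the total row count from a zero-based offset yields that row's negative position. A minimal sketch of the arithmetic in plain Python, with illustrative values that are not from the PR:

    # With zero-based offsets 0..n-1 and row count n,
    # offset - n is the row's negative iloc position.
    row_count = 5
    offsets = list(range(row_count))                    # [0, 1, 2, 3, 4]
    reversed_index = [o - row_count for o in offsets]   # [-5, -4, -3, -2, -1]

    # iloc[-2] on a 5-row frame resolves to positional offset 3.
    assert reversed_index.index(-2) == 3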
37 changes: 36 additions & 1 deletion bigframes/core/indexers.py
@@ -27,6 +27,7 @@
 import bigframes.core.guid as guid
 import bigframes.core.indexes as indexes
 import bigframes.core.scalar
+import bigframes.core.window_spec as windows
 import bigframes.dataframe
 import bigframes.dtypes
 import bigframes.exceptions as bfe
@@ -477,6 +478,24 @@ def _iloc_getitem_series_or_dataframe(
                 Union[bigframes.dataframe.DataFrame, bigframes.series.Series],
                 series_or_dataframe.iloc[0:0],
             )
 
+        # Check if both positive index and negative index are necessary
+        if isinstance(key, bigframes.series.Series):
Comment (Contributor): Might we need to check for bigframes.Index as well? Or maybe we should have a helper that identifies a "remote" or "large" object we don't want to iterate over.

Reply (Collaborator, author): Updated to also check the index type. For large objects this may not be necessary: on cloudtop, I tried 1 million keys (all with the same sign), and the process took 0.03s.

+            # Avoid data download
+            is_key_unisigned = False
+        elif "shape" in series_or_dataframe._block.__dict__:
+            # If there is a cache, we convert all indices to positive.
+            row_count = series_or_dataframe._block.shape[0]
+            key = [k if k >= 0 else row_count + k for k in key]
+            is_key_unisigned = True
Comment (Contributor): This seems a bit fragile. We could use block.expr.node.row_count; going through shape depends on implementation details that might change. I don't know if we necessarily need this optimization at all.

Reply (Collaborator, author): Removed.
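For reference, the fast path discussed here (and since removed) normalized negative positions eagerly: once the row count is known, each negative key can be shifted into the positive range, so only the positive index is needed. A sketch with illustrative values:

    # Normalize negative positions against a known row count.
    row_count = 10
    key = [0, -1, -10, 3]
    normalized = [k if k >= 0 else row_count + k for k in key]
    assert normalized == [0, 9, 0, 3]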

+        else:
+            first_sign = key[0] >= 0
+            is_key_unisigned = True
+            for k in key:
+                if (k >= 0) != first_sign:
+                    is_key_unisigned = False
+                    break
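The loop above is a short-circuiting scan for mixed signs; an equivalent standalone sketch (hypothetical helper name, illustrative inputs):

    def is_unisigned(key):
        # True when every position shares the sign of the first one.
        first_sign = key[0] >= 0
        return all((k >= 0) == first_sign for k in key)

    assert is_unisigned([0, 5, 4, 7])
    assert is_unisigned([-2, -5])
    assert not is_unisigned([0, 5, -2])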

         if isinstance(series_or_dataframe, bigframes.series.Series):
             original_series_name = series_or_dataframe.name
             series_name = (
@@ -497,7 +516,23 @@
         block = df._block
         # explicitly set index to offsets, reset_index may not generate offsets in some modes
         block, offsets_id = block.promote_offsets("temp_iloc_offsets_")
-        block = block.set_index([offsets_id])
+        pos_block = block.set_index([offsets_id])

+        if not is_key_unisigned or key[0] < 0:
+            neg_block, _ = block.apply_window_op(
+                offsets_id,
+                ops.aggregations.ReverseRowNumberOp(),
Comment (Contributor): Do we need a new op? Or could we have just used existing ops?

Reply (Collaborator, author): Updated to use SizeUnaryOp and SubOp instead.

+                window_spec=windows.rows(),
+                result_label="Reversed_Index",
+            )
+
+            neg_block = neg_block.set_index([neg_block.value_columns[-1]])

+        if is_key_unisigned:
+            block = pos_block if key[0] >= 0 else neg_block
+        else:
+            block = pos_block.concat([neg_block], how="inner")
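The concat handles mixed-sign keys: every row must be addressable by both its positive offset and its negative one, so the inner concat exposes both labels and a single df.loc[key] lookup can serve either form. A dict-based sketch of the idea, with illustrative values only:

    row_count = 8
    pos_index = {i: f"row{i}" for i in range(row_count)}
    neg_index = {i - row_count: f"row{i}" for i in range(row_count)}
    combined = {**pos_index, **neg_index}

    key = [0, 5, -2, -5, 3]
    assert [combined[k] for k in key] == ["row0", "row5", "row6", "row3", "row3"]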

         df = bigframes.dataframe.DataFrame(block)
 
         result = df.loc[key]
8 changes: 8 additions & 0 deletions bigframes/operations/aggregations.py
@@ -335,6 +335,14 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
         )


+@dataclasses.dataclass(frozen=True)
+class ReverseRowNumberOp(UnaryWindowOp):
+    name: ClassVar[str] = "size"
+
+    def output_type(self, *input_types):
+        return dtypes.INT_DTYPE

 @dataclasses.dataclass(frozen=True)
 class CutOp(UnaryWindowOp):
     # TODO: Unintuitive, refactor into multiple ops?
4 changes: 2 additions & 2 deletions tests/system/small/test_dataframe.py
@@ -4409,7 +4409,7 @@ def test_loc_list_multiindex(scalars_dfs_maybe_ordered):


 def test_iloc_list(scalars_df_index, scalars_pandas_df_index):
-    index_list = [0, 0, 0, 5, 4, 7]
+    index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3]
Comment (Contributor): Do we support the base case of iloc[neg_number]?

Reply (Collaborator, author): Seems to work.
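The base case the reviewer asks about, shown in pandas for reference (bigframes iloc aims to match pandas semantics):

    import pandas as pd

    s = pd.Series([10, 20, 30])
    assert s.iloc[-1] == 30                  # single negative position
    assert s.iloc[[-2]].tolist() == [20]     # list containing a negative position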


     bf_result = scalars_df_index.iloc[index_list]
     pd_result = scalars_pandas_df_index.iloc[index_list]
@@ -4423,7 +4423,7 @@ def test_iloc_list_partial_ordering(
 def test_iloc_list_partial_ordering(
     scalars_df_partial_ordering, scalars_pandas_df_index
 ):
-    index_list = [0, 0, 0, 5, 4, 7]
+    index_list = [0, 0, 0, 5, 4, 7, -2, -5, 3]

     bf_result = scalars_df_partial_ordering.iloc[index_list]
     pd_result = scalars_pandas_df_index.iloc[index_list]