Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Compile time #154

Merged
merged 42 commits into from
Jan 24, 2025
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
89564c3
Add Numba CacheLocator for string-generated functions.
Jan 15, 2025
2514414
Add cache flag for njit and overloads.
Jan 15, 2025
48e0c39
Add caching options to overloads, njits, and convert string-generated…
Jan 15, 2025
f569f6f
[run CI]
Jan 15, 2025
414942c
Remove a print statement; I don't know how it got into the commit.
Jan 16, 2025
b67be9b
Hash whole function inside bodo_exec. Append hash to function name. …
Jan 16, 2025
c90bcd1
[run CI]
Jan 16, 2025
d936c8a
More conversion of exec to bodo_exec. Additional overload_method cac…
Jan 16, 2025
7dbec7d
[run CI]
Jan 16, 2025
3c4dce6
Remove from problematic caching.
Jan 16, 2025
9372a80
[run CI]
Jan 16, 2025
ebc5911
Rename impl.
Jan 16, 2025
60428eb
Convert exec to bodo_exec.
Jan 16, 2025
1e9e883
[run CI]
Jan 16, 2025
d7f94fc
Convert exec to bodo_exec.
Jan 17, 2025
9a31cc2
Rename impl to bodo_* because those functions go through _gen_init_df…
Jan 17, 2025
0efc535
[run CI]
Jan 17, 2025
0702e27
Fix bug in bodo_exec for nested functions. re-enable series_clip cac…
Jan 17, 2025
1ed0a11
More conversion of exec to bodo_exec. Additional overload_method cac…
Jan 16, 2025
cccc4ba
[run CI]
Jan 16, 2025
06aac30
Remove from problematic caching.
Jan 16, 2025
9fa09c2
[run CI]
Jan 16, 2025
21de8bd
[run CI]
Jan 16, 2025
e4bba46
[run CI]
Jan 17, 2025
af52be8
[run CI]
Jan 17, 2025
dd446cb
Add additional variables that control code generation for pq reader t…
Jan 17, 2025
98793ec
[run CI]
Jan 17, 2025
ef9c648
Fix bodo_exec call in pq reader.
Jan 17, 2025
fc7d0e1
[run CI]
Jan 17, 2025
0edcf0b
Update bodo/utils/utils.py
DrTodd13 Jan 21, 2025
9eca061
PR fixes.
Jan 21, 2025
b8b8483
[run CI]
Jan 21, 2025
3d21f67
Turn off caching for dataframe_astype due to weird pickling error abo…
Jan 21, 2025
193d9bf
[run CI]
Jan 21, 2025
2265f9c
Remove all __pycache__ from subdirs of bodo. After first test run, m…
Jan 22, 2025
b1ce6dd
Use bodo_exec for string-generated func. Rename some internal funcs …
Jan 22, 2025
ba82390
Add a function to test in a subprocess two times right after all the …
Jan 23, 2025
c80a50e
[run CI]
Jan 23, 2025
d68808f
Remove debugging code.
Jan 23, 2025
d1a9c96
Add comment for why we have a test in this file now.
Jan 23, 2025
12a3ff3
Add ^ to regex to make sure def that we replace is top-level function…
Jan 23, 2025
8f57c09
[run CI]
Jan 23, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 207 additions & 74 deletions bodo/hiframes/dataframe_impl.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions bodo/hiframes/datetime_date_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,7 +450,7 @@ def fromordinal_impl(n): # pragma: no cover


# TODO: support general string formatting
@numba.njit
@numba.njit(cache=True)
def str_2d(a): # pragma: no cover
"""Takes in a number representing an date/time unit and formats it as a
2 character string, adding a leading zero if necessary."""
Expand Down Expand Up @@ -865,7 +865,7 @@ def lower_constant_datetime_date_arr(context, builder, typ, pyval):
return lir.Constant.literal_struct([data_const_arr, nulls_const_arr])


@numba.njit(no_cpython_wrapper=True)
@numba.njit(cache=True, no_cpython_wrapper=True)
def alloc_datetime_date_array(n): # pragma: no cover
data_arr = np.empty(n, dtype=np.int32)
# XXX: set all bits to not null since datetime.date array operations do not support
Expand Down
24 changes: 11 additions & 13 deletions bodo/hiframes/generic_pandas_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,15 @@ def generate_simple_series_impl(

# Create the function definition line
if arg_defaults is None:
func_text = "def impl(" + ", ".join(arg_names) + "):\n"
func_text = "def bodo_generate_simple_series(" + ", ".join(arg_names) + "):\n"
else:
arg_def_strings = [
name if name not in arg_defaults else f"{name}={arg_defaults.get(name)}"
for name in arg_names
]
func_text = "def impl(" + ", ".join(arg_def_strings) + "):\n"
func_text = (
"def bodo_generate_simple_series(" + ", ".join(arg_def_strings) + "):\n"
)

# Extract the underlying array of the series as a variable called "data"
if isinstance(series_arg, bodo.hiframes.pd_series_ext.SeriesType):
Expand Down Expand Up @@ -178,8 +180,7 @@ def generate_simple_series_impl(
raise_bodo_error(
f"generate_simple_series_impl: unsupported output type {out_type}"
)
loc_vars = {}
exec(
return bodo.utils.utils.bodo_exec(
func_text,
{
"bodo": bodo,
Expand All @@ -188,10 +189,9 @@ def generate_simple_series_impl(
"np": np,
"out_dtype": out_arr_type,
},
loc_vars,
{},
globals(),
)
impl = loc_vars["impl"]
return impl


def generate_series_to_df_impl(
Expand Down Expand Up @@ -252,7 +252,7 @@ def generate_series_to_df_impl(
name if default is None else f"{name}={default}"
for name, default in zip(arg_names, arg_defaults)
]
func_text = "def impl(" + ", ".join(arg_strings) + "):\n"
func_text = "def bodo_generate_series_to_df(" + ", ".join(arg_strings) + "):\n"

# Extract the underlying array of the series as a variable called "data"
if isinstance(series_arg, bodo.hiframes.pd_series_ext.SeriesType):
Expand Down Expand Up @@ -357,11 +357,9 @@ def generate_series_to_df_impl(
for i in range(n_out):
glbls[f"out_dtype{i}"] = out_types[i]

loc_vars = {}
exec(
return bodo.utils.utils.bodo_exec(
func_text,
glbls,
loc_vars,
{},
globals(),
)
impl = loc_vars["impl"]
return impl
48 changes: 27 additions & 21 deletions bodo/hiframes/pd_dataframe_ext.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,10 @@
to_str_arr_if_dict_array,
unwrap_typeref,
)
from bodo.utils.utils import is_null_pointer
from bodo.utils.utils import (
bodo_exec,
is_null_pointer,
)

_json_write = types.ExternalFunction(
"json_write",
Expand Down Expand Up @@ -964,7 +967,7 @@

# workaround to support row["A"] case in df.apply()
# implements getitem for namedtuples if generated by Bodo
@overload(operator.getitem, no_unliteral=True)
@overload(operator.getitem, no_unliteral=True, jit_options={"cache": True})
def namedtuple_getitem_overload(tup, idx):
if isinstance(tup, types.BaseNamedTuple) and is_overload_constant_str(idx):
field_idx = get_overload_const_str(idx)
Expand Down Expand Up @@ -1193,7 +1196,7 @@
pass


@overload(pushdown_safe_init_df, inline="never")
@overload(pushdown_safe_init_df, inline="never", jit_options={"cache": True})
def overload_pushdown_safe_init_df(table, colNames):
"""
A wrapper for init_dataframe to coerce a table to a DataFrame while preventing filter pushdown
Expand All @@ -1208,11 +1211,11 @@
[DataFrame] the data from the table wrapped in a DataFrame.
"""

def impl(table, colNames):
def bodo_pushdown_safe_init_df(table, colNames):

Check warning on line 1214 in bodo/hiframes/pd_dataframe_ext.py

View check run for this annotation

Codecov / codecov/patch

bodo/hiframes/pd_dataframe_ext.py#L1214

Added line #L1214 was not covered by tests
index = bodo.hiframes.pd_index_ext.init_range_index(0, len(table), 1, None)
return bodo.hiframes.pd_dataframe_ext.init_dataframe((table,), index, colNames)

return impl
return bodo_pushdown_safe_init_df

Check warning on line 1218 in bodo/hiframes/pd_dataframe_ext.py

View check run for this annotation

Codecov / codecov/patch

bodo/hiframes/pd_dataframe_ext.py#L1218

Added line #L1218 was not covered by tests


@intrinsic
Expand Down Expand Up @@ -2267,7 +2270,7 @@
return data_tup


@overload(pd.DataFrame, inline="always", no_unliteral=True)
@overload(pd.DataFrame, inline="always", no_unliteral=True, jit_options={"cache": True})
def pd_dataframe_overload(data=None, index=None, columns=None, dtype=None, copy=False):
# TODO: support other input combinations
# TODO: error checking
Expand All @@ -2279,18 +2282,14 @@
col_args, data_args, index_arg = _get_df_args(data, index, columns, dtype, copy)
col_var = ColNamesMetaType(tuple(col_args))

func_text = (
"def _init_df(data=None, index=None, columns=None, dtype=None, copy=False):\n"
)
func_text = "def bodo_init_df(data=None, index=None, columns=None, dtype=None, copy=False):\n"
func_text += f" return bodo.hiframes.pd_dataframe_ext.init_dataframe({data_args}, {index_arg}, __col_name_meta_value_pd_overload)\n"
loc_vars = {}
exec(
return bodo_exec(
func_text,
{"bodo": bodo, "np": np, "__col_name_meta_value_pd_overload": col_var},
loc_vars,
{},
globals(),
)
_init_df = loc_vars["_init_df"]
return _init_df


@intrinsic
Expand Down Expand Up @@ -3136,7 +3135,13 @@

# TODO: jitoptions for overload_method and infer_global
# (no_cpython_wrapper to avoid error for iterator object)
@overload_method(DataFrameType, "itertuples", inline="always", no_unliteral=True)
@overload_method(
DataFrameType,
"itertuples",
inline="always",
no_unliteral=True,
jit_options={"cache": True},
)
def itertuples_overload(df, index=True, name="Pandas"):
check_runtime_cols_unsupported(df, "DataFrame.itertuples()")
unsupported_args = {"index": index, "name": name}
Expand Down Expand Up @@ -3761,7 +3766,9 @@
return pandas_metadata


@overload_method(DataFrameType, "to_parquet", no_unliteral=True)
@overload_method(
DataFrameType, "to_parquet", no_unliteral=True, jit_options={"cache": True}
)
def to_parquet_overload(
df,
path,
Expand Down Expand Up @@ -3945,7 +3952,7 @@
# wrap the name with quotation mark to indicate it is a string
pandas_metadata_str = pandas_metadata_str.replace('"%s"', "%s")

func_text = "def df_to_parquet(df, path, engine='auto', compression='snappy', index=None, partition_cols=None, storage_options=None, row_group_size=-1, _bodo_file_prefix='part-', _bodo_timestamp_tz=None, _is_parallel=False):\n"
func_text = "def bodo_df_to_parquet(df, path, engine='auto', compression='snappy', index=None, partition_cols=None, storage_options=None, row_group_size=-1, _bodo_file_prefix='part-', _bodo_timestamp_tz=None, _is_parallel=False):\n"

# Why we are calling drop_duplicates_local_dictionary on all dict encoded arrays?
# Arrow doesn't support writing DictionaryArrays with nulls in the dictionary.
Expand Down Expand Up @@ -4136,13 +4143,12 @@
"decode_if_dict_table": decode_if_dict_table,
}
glbls.update(extra_globals)
exec(
return bodo_exec(
func_text,
glbls,
loc_vars,
globals(),
)
df_to_parquet = loc_vars["df_to_parquet"]
return df_to_parquet


# -------------------------------------- to_sql ------------------------------------------
Expand Down Expand Up @@ -4260,7 +4266,7 @@
ev.finalize()


@numba.njit
@numba.njit(cache=True)
def to_sql_exception_guard_encaps(
df,
name,
Expand Down
Loading
Loading