Use `window_size_bytes: auto` to specify automatic windowing (#3076)
Changes from all commits: e6a6bbc, a9fe4b0, cbe24ce, d8395e8, 3931d19, db4218b
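Based on the test changes below, the window size is now expressed directly in the Ray backend's `loader` section rather than through a separate `auto_window` flag. A minimal sketch of how that might look (the feature names and any keys other than `loader.window_size_bytes` are placeholders, not taken from this diff):

```python
from ludwig.api import LudwigModel

config = {
    "input_features": [{"name": "in_column", "type": "binary"}],   # placeholder feature
    "output_features": [{"name": "out_column", "type": "binary"}],
    "trainer": {"epochs": 1, "batch_size": 128},
}

backend_config = {
    "type": "ray",
    # "auto" -> size stream windows automatically; an int -> explicit size in bytes;
    # None -> no windowing.
    "loader": {"window_size_bytes": "auto"},
}

model = LudwigModel(config, backend=backend_config)
```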
```diff
@@ -15,7 +15,7 @@
 import copy
 import os
 import tempfile
-from typing import Optional
+from typing import Literal, Optional, Union

 import numpy as np
 import pandas as pd
```
```diff
@@ -987,13 +987,12 @@ def num_partitions(self):
         return 100

     def create_dataset_pipeline(
-        self, size: int, auto_window: bool = True, window_size_bytes: Optional[int] = None
+        self, size: int, window_size_bytes: Optional[Union[int, Literal["auto"]]] = None
     ) -> "DatasetPipeline":
         """Create a dataset of specified size to test auto-sizing.

         Args:
             size: Total size of the dataset in bytes
-            auto_window: Flag determining whether autosizing is enabled
             window_size_bytes: Pass to override the auto_window size

         Returns:
```
```diff
@@ -1015,15 +1014,14 @@ def create_dataset_pipeline(
             "output_features": [{"name": "out_column", "type": "binary"}],
             TRAINER: {"epochs": 1, BATCH_SIZE: 128},
         }
-        backend_config = {**RAY_BACKEND_CONFIG}
+        backend_config = copy.deepcopy(RAY_BACKEND_CONFIG)
+        backend_config["loader"] = {"window_size_bytes": window_size_bytes}
         backend_config["preprocessor_kwargs"] = {"num_cpu": 1}
         model = LudwigModel(config, backend=backend_config)

         # Create a dataset using the model backend to ensure it
         # is initialized correctly.
-        ds = model.backend.dataset_manager.create(
-            df, config=model.config, training_set_metadata={}, auto_window=auto_window
-        )
+        ds = model.backend.dataset_manager.create(df, config=model.config, training_set_metadata={})

         # To window without using a training session, we configure `DataParallelIngestSpec` to use the specified window
         # size and turn off other features (e.g., shuffle) that may incur computational overhead.
```

Comment on lines +1017 to +1018: "Just curious - are we always guaranteed to have the […]"

Reply: "Under the hood, this update uses […]"
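On the copy change discussed in the thread above: the excerpt does not state the motivation for switching from `{**RAY_BACKEND_CONFIG}` to `copy.deepcopy`, but the general difference is that a shallow copy shares nested dictionaries with the original, so mutating a nested key can leak between tests. A self-contained illustration with invented config contents:

```python
import copy

# Stand-in for a module-level config shared across tests; keys and values are made up.
RAY_BACKEND_CONFIG = {"type": "ray", "loader": {"fully_executed": False}}

# Deep copy: nested dicts are duplicated, so per-test mutation stays local.
deep = copy.deepcopy(RAY_BACKEND_CONFIG)
deep["loader"]["window_size_bytes"] = "auto"
assert "window_size_bytes" not in RAY_BACKEND_CONFIG["loader"]

# Shallow copy: top-level keys are new, but nested dicts are shared objects,
# so the same mutation leaks back into the shared config.
shallow = {**RAY_BACKEND_CONFIG}
shallow["loader"]["window_size_bytes"] = "auto"
assert RAY_BACKEND_CONFIG["loader"]["window_size_bytes"] == "auto"
```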
```diff
@@ -1032,7 +1030,7 @@ def create_dataset_pipeline(
             split=False,
             transform=False,
             use_stream_api=True,
-            stream_window_size=ds.get_window_size_bytes(window_size_bytes=window_size_bytes),
+            stream_window_size=ds.window_size_bytes,
             global_shuffle=False,
         )
         spec = DataParallelIngestSpec({"train": dataset_config})
```
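The hunk above replaces `ds.get_window_size_bytes(window_size_bytes=...)` with a plain `ds.window_size_bytes` read, suggesting the size is resolved from the loader config when the dataset is created rather than recomputed per call. Ludwig's actual dataset class is not part of this diff; a hypothetical sketch of that accessor pattern:

```python
from typing import Optional


class _DatasetSketch:
    """Hypothetical stand-in (not Ludwig's class) for the dataset wrapper."""

    def __init__(self, window_size_bytes: Optional[int]):
        # Resolved once from the backend loader config at construction time,
        # so callers no longer pass a value in per call.
        self._window_size_bytes = window_size_bytes

    @property
    def window_size_bytes(self) -> Optional[int]:
        """Read-only accessor used as `ds.window_size_bytes` in the test above."""
        return self._window_size_bytes
```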
```diff
@@ -1055,27 +1053,27 @@ def test_small_dataset(self, ray_cluster_2cpu):
         Without automatic window sizing, the number of blocks in the pipeline should match the number of partitions in
         the Dask dataframe.
         """
-        pipe = self.create_dataset_pipeline(self.auto_window_size // 2)
+        pipe = self.create_dataset_pipeline(self.auto_window_size // 2, window_size_bytes="auto")
         window = next(self.window_gen(pipe))
         assert window.num_blocks() == self.num_partitions

     def test_large_dataset(self, ray_cluster_2cpu):
         """A large dataset should trigger windowing."""
-        pipe = self.create_dataset_pipeline(self.auto_window_size * 2)
+        pipe = self.create_dataset_pipeline(self.auto_window_size * 2, window_size_bytes="auto")
         for i, window in enumerate(self.window_gen(pipe)):
             assert window.num_blocks() < self.num_partitions
             if i > 100:
                 break

     def test_window_autosizing_disabled(self, ray_cluster_2cpu):
         """If window autosizing is disabled, no datasets should be windowed."""
-        pipe = self.create_dataset_pipeline(self.auto_window_size * 2, auto_window=False)
+        pipe = self.create_dataset_pipeline(self.auto_window_size * 2, window_size_bytes=None)
         window = next(self.window_gen(pipe))
         assert window.num_blocks() == self.num_partitions

     def test_user_window_size(self, ray_cluster_2cpu):
         """If the user supplies a window size, do not autosize."""
-        auto_pipe = self.create_dataset_pipeline(self.auto_window_size * 2)
+        auto_pipe = self.create_dataset_pipeline(self.auto_window_size * 2, window_size_bytes="auto")
         user_pipe = self.create_dataset_pipeline(self.auto_window_size * 2, window_size_bytes=self.auto_window_size * 4)
         windows = zip(self.window_gen(auto_pipe), self.window_gen(user_pipe))
```
Comment: "Really nice use of `Literal`, we should use that more in Ludwig."
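As a footnote to the comment above, a small hypothetical example of the `Literal` pattern used in the new signature; `resolve_window_size` is not a Ludwig function and its "auto" policy is a placeholder:

```python
from typing import Literal, Optional, Union

WindowSizeBytes = Optional[Union[int, Literal["auto"]]]


def resolve_window_size(window_size_bytes: WindowSizeBytes) -> Optional[int]:
    """Map the three accepted settings to a concrete window size (toy logic)."""
    if window_size_bytes is None:
        return None                    # windowing disabled
    if window_size_bytes == "auto":
        return 1 << 30                 # placeholder auto-sizing policy
    return int(window_size_bytes)      # explicit size in bytes


resolve_window_size("auto")    # OK
resolve_window_size(2 ** 20)   # OK
resolve_window_size(None)      # OK
# resolve_window_size("big")   # rejected by a type checker such as mypy
```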