Added compliance with multiple river features #93

Merged · 19 commits · Aug 21, 2024
3 changes: 3 additions & 0 deletions Makefile
@@ -3,6 +3,9 @@ COMMIT_HASH := $(shell eval git rev-parse HEAD)
format:
pre-commit run --all-files

test:
pytest

execute-notebooks:
jupyter nbconvert --execute --to notebook --inplace docs/*/*/*.ipynb --ExecutePreprocessor.timeout=-1

4 changes: 2 additions & 2 deletions README.md
@@ -77,7 +77,7 @@ For further examples check out the <a href="https://online-ml.github.io/deep-riv
... metric.update(y, y_pred) # update the metric
... model_pipeline.learn_one(x, y) # make the model learn
>>> print(f"Accuracy: {metric.get():.4f}")
Accuracy: 0.6736
Accuracy: 0.7264

```
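For orientation, here is a hedged, self-contained sketch of the kind of loop this README excerpt comes from, assuming deep_river's `Classifier` wrapper, a small torch module, and river's `Phishing` dataset (the actual README model may differ):

```python
# Minimal sketch, not the exact README example: an incremental
# predict-then-learn loop with a torch module wrapped for river.
import torch
from deep_river.classification import Classifier
from river import datasets, metrics, preprocessing


class MyModule(torch.nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.dense0 = torch.nn.Linear(n_features, 1)

    def forward(self, x):
        return torch.sigmoid(self.dense0(x))


model_pipeline = preprocessing.StandardScaler() | Classifier(
    module=MyModule, loss_fn="binary_cross_entropy", optimizer_fn="sgd", lr=1e-3
)
metric = metrics.Accuracy()

for x, y in datasets.Phishing():
    y_pred = model_pipeline.predict_one(x)  # predict before the label is revealed
    metric.update(y, y_pred)                # update the metric
    model_pipeline.learn_one(x, y)          # make the model learn

print(f"Accuracy: {metric.get():.4f}")
```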
### Multi Target Regression
@@ -157,7 +157,7 @@ MicroAverage(MAE): 34.31
... metric.update(y, score)
...
>>> print(f"ROCAUC: {metric.get():.4f}")
ROCAUC: 0.9017
ROCAUC: 0.7812

```
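The ROCAUC fragment follows river's usual metric protocol — update with a label and a score, then read the running value. A hedged toy illustration, with made-up scores standing in for an anomaly detector's output:

```python
# Illustrative only: feeding (label, score) pairs to a ROCAUC metric.
# The scores are fabricated for the example.
from river import metrics

metric = metrics.ROCAUC()
for y, score in [(0, 0.10), (1, 0.85), (0, 0.30), (1, 0.65)]:
    metric.update(y, score)

print(f"ROCAUC: {metric.get():.4f}")  # 1.0000: the toy scores separate the classes perfectly
```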

24 changes: 17 additions & 7 deletions benchmarks/config.py
@@ -1,9 +1,17 @@
from deep_river.classification import Classifier as TorchClassifier
from deep_river.classification import RollingClassifier as TorchRollingClassifier
from deep_river.classification import (
RollingClassifier as TorchRollingClassifier,
)
from deep_river.regression import Regressor as TorchRegressor
from deep_river.regression import RollingRegressor as TorchRollingRegressor
from model_zoo.torch import TorchMLPClassifier, TorchMLPRegressor, TorchLogisticRegression, \
TorchLinearRegression, TorchLSTMClassifier, TorchLSTMRegressor
from model_zoo.torch import (
TorchMLPClassifier,
TorchMLPRegressor,
TorchLogisticRegression,
TorchLinearRegression,
TorchLSTMClassifier,
TorchLSTMRegressor,
)
from river import preprocessing, linear_model, neural_net, dummy
from river import optim, evaluate, stats

@@ -21,7 +29,9 @@
"Binary classification": {
"Logistic regression": (
preprocessing.StandardScaler()
| linear_model.LogisticRegression(optimizer=optim.SGD(LEARNING_RATE))
| linear_model.LogisticRegression(
optimizer=optim.SGD(LEARNING_RATE)
)
)
},
"Multiclass classification": {
@@ -32,7 +42,7 @@
loss_fn="binary_cross_entropy",
optimizer_fn="sgd",
is_class_incremental=True,
lr=LEARNING_RATE
lr=LEARNING_RATE,
)
),
"Torch MLP": (
@@ -42,7 +52,7 @@
loss_fn="binary_cross_entropy",
optimizer_fn="sgd",
is_class_incremental=True,
lr=LEARNING_RATE
lr=LEARNING_RATE,
)
),
"Torch LSTM": (
@@ -104,4 +114,4 @@
),
"[baseline] Mean predictor": dummy.StatisticRegressor(stats.Mean()),
},
}
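For readers skimming the config, a hedged sketch of how one entry of the MODELS mapping is wired after this change; `LEARNING_RATE`, the `module=` keyword, and the import path are assumptions inferred from the fragments above:

```python
# Sketch of a single MODELS entry, not the full config: a scaler piped
# into deep_river's wrapper around a torch module class.
from deep_river.classification import Classifier as TorchClassifier
from model_zoo.torch import TorchLogisticRegression  # import path assumed
from river import preprocessing

LEARNING_RATE = 0.005  # assumed value; the real constant lives elsewhere in config.py

MODELS = {
    "Binary classification": {
        "Torch logistic regression": (
            preprocessing.StandardScaler()
            | TorchClassifier(
                module=TorchLogisticRegression,
                loss_fn="binary_cross_entropy",
                optimizer_fn="sgd",
                is_class_incremental=True,
                lr=LEARNING_RATE,
            )
        ),
    },
}
```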
30 changes: 25 additions & 5 deletions benchmarks/model_zoo/torch.py
@@ -1,4 +1,6 @@
import torch


class TorchMLPClassifier(torch.nn.Module):
def __init__(self, n_features: int, hidden_size: int = 5):
super().__init__()
@@ -13,6 +15,7 @@ def forward(self, x):
x = self.softmax(x)
return x


class TorchMLPRegressor(torch.nn.Module):

def __init__(self, n_features: int, hidden_size: int = 5):
@@ -26,6 +29,7 @@ def forward(self, x):
x = self.nonlin(self.linear2(x))
return x


class TorchLogisticRegression(torch.nn.Module):
def __init__(self, n_features: int, n_classes: int = 2):
super().__init__()
@@ -36,20 +40,29 @@ def forward(self, X):
X = self.linear(X)
return self.softmax(X)


class TorchLinearRegression(torch.nn.Module):
def __init__(self, n_features: int):
super().__init__()
self.linear = torch.nn.Linear(n_features, 1)

def forward(self, X):
return self.linear(X)


class TorchLSTMClassifier(torch.nn.Module):
def __init__(self, n_features, num_layers=1, hidden_size=1):
super().__init__()
self.n_features=n_features
self.n_features = n_features
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = torch.nn.LSTM(input_size=n_features, num_layers=num_layers, hidden_size=hidden_size, batch_first=False, bias=True)
self.lstm = torch.nn.LSTM(
input_size=n_features,
num_layers=num_layers,
hidden_size=hidden_size,
batch_first=False,
bias=True,
)
self.fc = torch.nn.Linear(hidden_size, 1)
self.softmax = torch.nn.Softmax(dim=-1)

@@ -58,15 +71,22 @@ def forward(self, X, **kwargs):
X = self.fc(out[-1, :])
return self.softmax(X)


class TorchLSTMRegressor(torch.nn.Module):
def __init__(self, n_features, num_layers=1, hidden_size=1):
super().__init__()
self.n_features=n_features
self.n_features = n_features
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = torch.nn.LSTM(input_size=n_features, num_layers=num_layers, hidden_size=hidden_size, batch_first=False, bias=True)
self.lstm = torch.nn.LSTM(
input_size=n_features,
num_layers=num_layers,
hidden_size=hidden_size,
batch_first=False,
bias=True,
)
self.fc = torch.nn.Linear(hidden_size, 1)

def forward(self, X, **kwargs):
out, (hn, cn) = self.lstm(X)
return self.fc(out[-1, :])
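A quick shape check makes the `out[-1, :]` indexing in the LSTM modules concrete; an illustrative snippet, assuming the file is importable as `model_zoo.torch`:

```python
# With batch_first=False and an unbatched (seq_len, n_features) input,
# the LSTM returns out of shape (seq_len, hidden_size); out[-1, :] is
# the hidden state at the last time step, fed into the final Linear.
import torch
from model_zoo.torch import TorchLSTMRegressor  # import path assumed

model = TorchLSTMRegressor(n_features=4, hidden_size=8)
seq = torch.randn(10, 4)  # 10 time steps, 4 features
print(model(seq).shape)   # torch.Size([1]): one prediction from the last step
```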
94 changes: 49 additions & 45 deletions benchmarks/render.py
@@ -11,30 +11,26 @@
def render_df(df_path: Path) -> dict:
df = pd.read_csv(str(df_path))

unique_datasets = list(df['dataset'].unique())
unique_datasets = list(df["dataset"].unique())
measures = list(df.columns)[4:]
res = {
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"data": {
"values": df.to_dict(orient="records")
#"url": f"benchmarks/{df_path.name}"
# "url": f"benchmarks/{df_path.name}"
},
"params": [
{
"name": "models",
"select": {"type": "point", "fields": ["model"]},
"bind": "legend"
"bind": "legend",
},
{
"name": "Dataset",
"value": unique_datasets[0],
"bind": {"input": "select", "options": unique_datasets}
"bind": {"input": "select", "options": unique_datasets},
},
{
"name": "grid",
"select": "interval",
"bind": "scales"
}
{"name": "grid", "select": "interval", "bind": "scales"},
],
"transform": [
{"filter": {"field": "dataset", "equal": {"expr": "Dataset"}}}
@@ -52,12 +48,12 @@ def render_df(df_path: Path) -> dict:
"titleFontSize": 18,
"labelFontSize": 18,
"title": "Instance",
}
},
},
"y": {
"field": {"repeat": "row"},
"type": "quantitative",
"axis": {"titleFontSize": 18, "labelFontSize": 18}
"axis": {"titleFontSize": 18, "labelFontSize": 18},
},
"color": {
"field": "model",
@@ -72,64 +68,72 @@
},
"opacity": {
"condition": {"param": "models", "value": 1},
"value": 0.2
}
}
}
"value": 0.2,
},
},
},
}
return res


if __name__ == '__main__':
if __name__ == "__main__":

if Path('details.json').exists():
if Path('../docs/benchmarks/details.json').exists():
Path('../docs/benchmarks/details.json').unlink()
shutil.move('details.json', '../docs/benchmarks/details.json')
details = json.load(open('../docs/benchmarks/details.json'))
if Path("details.json").exists():
if Path("../docs/benchmarks/details.json").exists():
Path("../docs/benchmarks/details.json").unlink()
shutil.move("details.json", "../docs/benchmarks/details.json")
details = json.load(open("../docs/benchmarks/details.json"))

with open("../docs/benchmarks/index.md", "w", encoding='utf-8') as f:
with open("../docs/benchmarks/index.md", "w", encoding="utf-8") as f:
print_ = lambda x: print(x, file=f, end="\n\n")
print_(
"""---
"""---
hide:
- navigation
---
"""
)

print_('# Benchmark')
print_("# Benchmark")

for track_name, track_details in details.items():
print_(f'## {track_name}')
csv_name = track_name.replace(' ', '_').lower()
if Path(f'{csv_name}.csv').exists():
if Path(f'../docs/benchmarks/{csv_name}.csv').exists():
Path(f'../docs/benchmarks/{csv_name}.csv').unlink()
shutil.move(f'{csv_name}.csv', '../docs/benchmarks/', )
print_(f"## {track_name}")
csv_name = track_name.replace(" ", "_").lower()
if Path(f"{csv_name}.csv").exists():
if Path(f"../docs/benchmarks/{csv_name}.csv").exists():
Path(f"../docs/benchmarks/{csv_name}.csv").unlink()
shutil.move(
f"{csv_name}.csv",
"../docs/benchmarks/",
)

df_path = Path(f'../docs/benchmarks/{csv_name}.csv')
df_path = Path(f"../docs/benchmarks/{csv_name}.csv")
print_("```vegalite")
print_(json.dumps(render_df(df_path), indent=2))
print_("```")

print_('### Datasets')
print_("### Datasets")
for dataset_name, dataset_details in track_details[
'Dataset'].items():
print_(f'<details>')
print_(f'<summary>{dataset_name}</summary>')
"Dataset"
].items():
print_(f"<details>")
print_(f"<summary>{dataset_name}</summary>")
print_(pre(dataset_details))
print_(f'</details>')
print_('### Models')
for model_name, model_details in track_details['Model'].items():
print_(f'<details>')
print_(f'<summary>{model_name}</summary>')
print_(f"</details>")
print_("### Models")
for model_name, model_details in track_details["Model"].items():
print_(f"<details>")
print_(f"<summary>{model_name}</summary>")
print_(pre(model_details))
print_(f'</details>')
print_(f"</details>")

print_("# Environment")
print_(
pre(watermark(python=True,
packages="river,numpy,scikit-learn,pandas,scipy",
machine=True))
)
pre(
watermark(
python=True,
packages="river,numpy,scikit-learn,pandas,scipy",
machine=True,
)
)
)
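To see what `render_df` produces, a hypothetical smoke test (not part of the PR); it assumes the CSV layout written by run.py, where the first four columns are metadata and everything after them is treated as a measure:

```python
# Hypothetical smoke test for render_df: write a tiny results CSV and
# render its Vega-Lite spec. Run from the benchmarks/ directory.
import json
from pathlib import Path

import pandas as pd
from render import render_df

df = pd.DataFrame(
    {
        "step": [100, 200],
        "track": ["Binary classification"] * 2,
        "model": ["Torch MLP"] * 2,
        "dataset": ["Phishing"] * 2,
        "Accuracy": [0.80, 0.82],  # columns after the fourth become measures
    }
)
df.to_csv("tiny.csv", index=False)
spec = render_df(Path("tiny.csv"))
print(json.dumps(spec, indent=2)[:300])  # peek at the generated spec
```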
26 changes: 17 additions & 9 deletions benchmarks/run.py
@@ -15,19 +15,23 @@
logger = logging.getLogger(__name__)
from tqdm import tqdm

def run_dataset(model_str,no_dataset, no_track):

def run_dataset(model_str, no_dataset, no_track):
model_name = model_str
track = TRACKS[no_track]
dataset = track.datasets[no_dataset]
MODELS["Binary classification"].update(MODELS["Multiclass classification"])
model = MODELS[track.name][model_name].clone()
print(f'Processing {model_str} on {dataset.__class__.__name__}')
print(f"Processing {model_str} on {dataset.__class__.__name__}")

results = []
track = copy.deepcopy(track)
time = 0.0
for i in tqdm(track.run(model, dataset, n_checkpoints=N_CHECKPOINTS), total=N_CHECKPOINTS):
time += i['Time'].total_seconds()
for i in tqdm(
track.run(model, dataset, n_checkpoints=N_CHECKPOINTS),
total=N_CHECKPOINTS,
):
time += i["Time"].total_seconds()
res = {
"step": i["Step"],
"track": track.name,
@@ -37,17 +41,20 @@ def run_dataset(model_str,no_dataset, no_track):
for k, v in i.items():
if isinstance(v, metrics.base.Metric):
res[k] = v.get()
res["Memory in Mb"] = i['Memory'] / 1024 ** 2
res["Memory in Mb"] = i["Memory"] / 1024**2
res["Time in s"] = time
results.append(res)
if time > 3600:
break
return results


def run_track(models: List[str], no_track: int, n_workers: int = 50):
pool = multiprocessing.Pool(processes=n_workers)
track = TRACKS[no_track]
runs = list(itertools.product(models, range(len(track.datasets)), [no_track]))
runs = list(
itertools.product(models, range(len(track.datasets)), [no_track])
)
results = []

for val in pool.starmap(run_dataset, runs):
@@ -56,7 +63,7 @@ def run_track(models: List[str], no_track: int, n_workers: int = 50):
pd.DataFrame(results).to_csv(f"./{csv_name}.csv", index=False)


if __name__ == '__main__':
if __name__ == "__main__":

MODELS["Binary classification"].update(MODELS["Multiclass classification"])

@@ -66,9 +73,10 @@ def run_track(models: List[str], no_track: int, n_workers: int = 50):
details[track.name] = {"Dataset": {}, "Model": {}}
for dataset in track.datasets:
details[track.name]["Dataset"][dataset.__class__.__name__] = repr(
dataset)
dataset
)
for model_name, model in MODELS[track.name].items():
details[track.name]["Model"][model_name] = repr(model)
with open("details.json", "w") as f:
json.dump(details, f, indent=2)
run_track(models=MODELS[track.name].keys(), no_track=i, n_workers=10)
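The fan-out in `run_track` is plain `itertools.product` feeding `Pool.starmap`; here is a self-contained miniature of that pattern (illustrative names, not the benchmark code):

```python
# Miniature of the run_track parallelisation pattern: one task per
# (model, dataset, track) combination, distributed across workers.
import itertools
import multiprocessing


def run_one(model: str, no_dataset: int, no_track: int) -> str:
    return f"{model} on dataset {no_dataset} of track {no_track}"


if __name__ == "__main__":
    runs = list(itertools.product(["logreg", "mlp"], range(3), [0]))
    with multiprocessing.Pool(processes=2) as pool:
        for res in pool.starmap(run_one, runs):
            print(res)
```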