Added compliance with multiple river features #93

Merged · 19 commits · Aug 21, 2024
3 changes: 3 additions & 0 deletions Makefile
@@ -3,6 +3,9 @@ COMMIT_HASH := $(shell eval git rev-parse HEAD)
format:
pre-commit run --all-files

test:
pytest

execute-notebooks:
jupyter nbconvert --execute --to notebook --inplace docs/*/*/*.ipynb --ExecutePreprocessor.timeout=-1

4 changes: 2 additions & 2 deletions README.md
@@ -77,7 +77,7 @@ For further examples check out the <a href="https://online-ml.github.io/deep-riv
... metric.update(y, y_pred) # update the metric
... model_pipeline.learn_one(x, y) # make the model learn
>>> print(f"Accuracy: {metric.get():.4f}")
Accuracy: 0.6736
Accuracy: 0.7264

```
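For orientation, here is a hedged, self-contained sketch of the kind of loop this README excerpt comes from, assuming deep_river's `Classifier` wrapper, a small torch module, and river's `Phishing` dataset (the actual README model may differ):

```python
# Minimal sketch, not the exact README example: an incremental
# predict-then-learn loop with a torch module wrapped for river.
import torch
from deep_river.classification import Classifier
from river import datasets, metrics, preprocessing


class MyModule(torch.nn.Module):
    def __init__(self, n_features):
        super().__init__()
        self.dense0 = torch.nn.Linear(n_features, 1)

    def forward(self, x):
        return torch.sigmoid(self.dense0(x))


model_pipeline = preprocessing.StandardScaler() | Classifier(
    module=MyModule, loss_fn="binary_cross_entropy", optimizer_fn="sgd", lr=1e-3
)
metric = metrics.Accuracy()

for x, y in datasets.Phishing():
    y_pred = model_pipeline.predict_one(x)  # predict before the label is revealed
    metric.update(y, y_pred)                # update the metric
    model_pipeline.learn_one(x, y)          # make the model learn

print(f"Accuracy: {metric.get():.4f}")
```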
### Multi Target Regression
@@ -157,7 +157,7 @@ MicroAverage(MAE): 34.31
... metric.update(y, score)
...
>>> print(f"ROCAUC: {metric.get():.4f}")
ROCAUC: 0.9017
ROCAUC: 0.7812

```
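The ROCAUC fragment follows river's usual metric protocol — update with a label and a score, then read the running value. A hedged toy illustration, with made-up scores standing in for an anomaly detector's output:

```python
# Illustrative only: feeding (label, score) pairs to a ROCAUC metric.
# The scores are fabricated for the example.
from river import metrics

metric = metrics.ROCAUC()
for y, score in [(0, 0.10), (1, 0.85), (0, 0.30), (1, 0.65)]:
    metric.update(y, score)

print(f"ROCAUC: {metric.get():.4f}")  # 1.0000: the toy scores separate the classes perfectly
```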

24 changes: 17 additions & 7 deletions benchmarks/config.py
@@ -1,9 +1,17 @@
from deep_river.classification import Classifier as TorchClassifier
from deep_river.classification import RollingClassifier as TorchRollingClassifier
from deep_river.classification import (
RollingClassifier as TorchRollingClassifier,
)
from deep_river.regression import Regressor as TorchRegressor
from deep_river.regression import RollingRegressor as TorchRollingRegressor
from model_zoo.torch import TorchMLPClassifier, TorchMLPRegressor, TorchLogisticRegression, \
TorchLinearRegression, TorchLSTMClassifier, TorchLSTMRegressor
from model_zoo.torch import (
TorchMLPClassifier,
TorchMLPRegressor,
TorchLogisticRegression,
TorchLinearRegression,
TorchLSTMClassifier,
TorchLSTMRegressor,
)
from river import preprocessing, linear_model, neural_net, dummy
from river import optim, evaluate, stats

@@ -21,7 +29,9 @@
"Binary classification": {
"Logistic regression": (
preprocessing.StandardScaler()
| linear_model.LogisticRegression(optimizer=optim.SGD(LEARNING_RATE))
| linear_model.LogisticRegression(
optimizer=optim.SGD(LEARNING_RATE)
)
)
},
"Multiclass classification": {
@@ -32,7 +42,7 @@
loss_fn="binary_cross_entropy",
optimizer_fn="sgd",
is_class_incremental=True,
lr=LEARNING_RATE
lr=LEARNING_RATE,
)
),
"Torch MLP": (
@@ -42,7 +52,7 @@
loss_fn="binary_cross_entropy",
optimizer_fn="sgd",
is_class_incremental=True,
lr=LEARNING_RATE
lr=LEARNING_RATE,
)
),
"Torch LSTM": (
@@ -104,4 +114,4 @@
),
"[baseline] Mean predictor": dummy.StatisticRegressor(stats.Mean()),
},
}
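For readers skimming the config, a hedged sketch of how one entry of the MODELS mapping is wired after this change; `LEARNING_RATE`, the `module=` keyword, and the import path are assumptions inferred from the fragments above:

```python
# Sketch of a single MODELS entry, not the full config: a scaler piped
# into deep_river's wrapper around a torch module class.
from deep_river.classification import Classifier as TorchClassifier
from model_zoo.torch import TorchLogisticRegression  # import path assumed
from river import preprocessing

LEARNING_RATE = 0.005  # assumed value; the real constant lives elsewhere in config.py

MODELS = {
    "Binary classification": {
        "Torch logistic regression": (
            preprocessing.StandardScaler()
            | TorchClassifier(
                module=TorchLogisticRegression,
                loss_fn="binary_cross_entropy",
                optimizer_fn="sgd",
                is_class_incremental=True,
                lr=LEARNING_RATE,
            )
        ),
    },
}
```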
30 changes: 25 additions & 5 deletions benchmarks/model_zoo/torch.py
@@ -1,4 +1,6 @@
import torch


class TorchMLPClassifier(torch.nn.Module):
def __init__(self, n_features: int, hidden_size: int = 5):
super().__init__()
@@ -13,6 +15,7 @@ def forward(self, x):
x = self.softmax(x)
return x


class TorchMLPRegressor(torch.nn.Module):

def __init__(self, n_features: int, hidden_size: int = 5):
@@ -26,6 +29,7 @@ def forward(self, x):
x = self.nonlin(self.linear2(x))
return x


class TorchLogisticRegression(torch.nn.Module):
def __init__(self, n_features: int, n_classes: int = 2):
super().__init__()
@@ -36,20 +40,29 @@ def forward(self, X):
X = self.linear(X)
return self.softmax(X)


class TorchLinearRegression(torch.nn.Module):
def __init__(self, n_features: int):
super().__init__()
self.linear = torch.nn.Linear(n_features, 1)

def forward(self, X):
return self.linear(X)


class TorchLSTMClassifier(torch.nn.Module):
def __init__(self, n_features, num_layers=1, hidden_size=1):
super().__init__()
self.n_features=n_features
self.n_features = n_features
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = torch.nn.LSTM(input_size=n_features, num_layers=num_layers, hidden_size=hidden_size, batch_first=False, bias=True)
self.lstm = torch.nn.LSTM(
input_size=n_features,
num_layers=num_layers,
hidden_size=hidden_size,
batch_first=False,
bias=True,
)
self.fc = torch.nn.Linear(hidden_size, 1)
self.softmax = torch.nn.Softmax(dim=-1)

@@ -58,15 +71,22 @@ def forward(self, X, **kwargs):
X = self.fc(out[-1, :])
return self.softmax(X)


class TorchLSTMRegressor(torch.nn.Module):
def __init__(self, n_features, num_layers=1, hidden_size=1):
super().__init__()
self.n_features=n_features
self.n_features = n_features
self.hidden_size = hidden_size
self.num_layers = num_layers
self.lstm = torch.nn.LSTM(input_size=n_features, num_layers=num_layers, hidden_size=hidden_size, batch_first=False, bias=True)
self.lstm = torch.nn.LSTM(
input_size=n_features,
num_layers=num_layers,
hidden_size=hidden_size,
batch_first=False,
bias=True,
)
self.fc = torch.nn.Linear(hidden_size, 1)

def forward(self, X, **kwargs):
out, (hn, cn) = self.lstm(X)
return self.fc(out[-1, :])
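A quick shape check makes the `out[-1, :]` indexing in the LSTM modules concrete; an illustrative snippet, assuming the file is importable as `model_zoo.torch`:

```python
# With batch_first=False and an unbatched (seq_len, n_features) input,
# the LSTM returns out of shape (seq_len, hidden_size); out[-1, :] is
# the hidden state at the last time step, fed into the final Linear.
import torch
from model_zoo.torch import TorchLSTMRegressor  # import path assumed

model = TorchLSTMRegressor(n_features=4, hidden_size=8)
seq = torch.randn(10, 4)  # 10 time steps, 4 features
print(model(seq).shape)   # torch.Size([1]): one prediction from the last step
```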
94 changes: 49 additions & 45 deletions benchmarks/render.py
@@ -11,30 +11,26 @@
def render_df(df_path: Path) -> dict:
df = pd.read_csv(str(df_path))

unique_datasets = list(df['dataset'].unique())
unique_datasets = list(df["dataset"].unique())
measures = list(df.columns)[4:]
res = {
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"data": {
"values": df.to_dict(orient="records")
#"url": f"benchmarks/{df_path.name}"
# "url": f"benchmarks/{df_path.name}"
},
"params": [
{
"name": "models",
"select": {"type": "point", "fields": ["model"]},
"bind": "legend"
"bind": "legend",
},
{
"name": "Dataset",
"value": unique_datasets[0],
"bind": {"input": "select", "options": unique_datasets}
"bind": {"input": "select", "options": unique_datasets},
},
{
"name": "grid",
"select": "interval",
"bind": "scales"
}
{"name": "grid", "select": "interval", "bind": "scales"},
],
"transform": [
{"filter": {"field": "dataset", "equal": {"expr": "Dataset"}}}
@@ -52,12 +48,12 @@ def render_df(df_path: Path) -> dict:
"titleFontSize": 18,
"labelFontSize": 18,
"title": "Instance",
}
},
},
"y": {
"field": {"repeat": "row"},
"type": "quantitative",
"axis": {"titleFontSize": 18, "labelFontSize": 18}
"axis": {"titleFontSize": 18, "labelFontSize": 18},
},
"color": {
"field": "model",
@@ -72,64 +68,72 @@
},
"opacity": {
"condition": {"param": "models", "value": 1},
"value": 0.2
}
}
}
"value": 0.2,
},
},
},
}
return res


if __name__ == '__main__':
if __name__ == "__main__":

if Path('details.json').exists():
if Path('../docs/benchmarks/details.json').exists():
Path('../docs/benchmarks/details.json').unlink()
shutil.move('details.json', '../docs/benchmarks/details.json')
details = json.load(open('../docs/benchmarks/details.json'))
if Path("details.json").exists():
if Path("../docs/benchmarks/details.json").exists():
Path("../docs/benchmarks/details.json").unlink()
shutil.move("details.json", "../docs/benchmarks/details.json")
details = json.load(open("../docs/benchmarks/details.json"))

with open("../docs/benchmarks/index.md", "w", encoding='utf-8') as f:
with open("../docs/benchmarks/index.md", "w", encoding="utf-8") as f:
print_ = lambda x: print(x, file=f, end="\n\n")
print_(
"""---
"""---
hide:
- navigation
---
"""
)

print_('# Benchmark')
print_("# Benchmark")

for track_name, track_details in details.items():
print_(f'## {track_name}')
csv_name = track_name.replace(' ', '_').lower()
if Path(f'{csv_name}.csv').exists():
if Path(f'../docs/benchmarks/{csv_name}.csv').exists():
Path(f'../docs/benchmarks/{csv_name}.csv').unlink()
shutil.move(f'{csv_name}.csv', '../docs/benchmarks/', )
print_(f"## {track_name}")
csv_name = track_name.replace(" ", "_").lower()
if Path(f"{csv_name}.csv").exists():
if Path(f"../docs/benchmarks/{csv_name}.csv").exists():
Path(f"../docs/benchmarks/{csv_name}.csv").unlink()
shutil.move(
f"{csv_name}.csv",
"../docs/benchmarks/",
)

df_path = Path(f'../docs/benchmarks/{csv_name}.csv')
df_path = Path(f"../docs/benchmarks/{csv_name}.csv")
print_("```vegalite")
print_(json.dumps(render_df(df_path), indent=2))
print_("```")

print_('### Datasets')
print_("### Datasets")
for dataset_name, dataset_details in track_details[
'Dataset'].items():
print_(f'<details>')
print_(f'<summary>{dataset_name}</summary>')
"Dataset"
].items():
print_(f"<details>")
print_(f"<summary>{dataset_name}</summary>")
print_(pre(dataset_details))
print_(f'</details>')
print_('### Models')
for model_name, model_details in track_details['Model'].items():
print_(f'<details>')
print_(f'<summary>{model_name}</summary>')
print_(f"</details>")
print_("### Models")
for model_name, model_details in track_details["Model"].items():
print_(f"<details>")
print_(f"<summary>{model_name}</summary>")
print_(pre(model_details))
print_(f'</details>')
print_(f"</details>")

print_("# Environment")
print_(
pre(watermark(python=True,
packages="river,numpy,scikit-learn,pandas,scipy",
machine=True))
)
pre(
watermark(
python=True,
packages="river,numpy,scikit-learn,pandas,scipy",
machine=True,
)
)
)
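To see what `render_df` produces, a hypothetical smoke test (not part of the PR); it assumes the CSV layout written by run.py, where the first four columns are metadata and everything after them is treated as a measure:

```python
# Hypothetical smoke test for render_df: write a tiny results CSV and
# render its Vega-Lite spec. Run from the benchmarks/ directory.
import json
from pathlib import Path

import pandas as pd
from render import render_df

df = pd.DataFrame(
    {
        "step": [100, 200],
        "track": ["Binary classification"] * 2,
        "model": ["Torch MLP"] * 2,
        "dataset": ["Phishing"] * 2,
        "Accuracy": [0.80, 0.82],  # columns after the fourth become measures
    }
)
df.to_csv("tiny.csv", index=False)
spec = render_df(Path("tiny.csv"))
print(json.dumps(spec, indent=2)[:300])  # peek at the generated spec
```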
26 changes: 17 additions & 9 deletions benchmarks/run.py
@@ -15,19 +15,23 @@
logger = logging.getLogger(__name__)
from tqdm import tqdm

def run_dataset(model_str,no_dataset, no_track):

def run_dataset(model_str, no_dataset, no_track):
model_name = model_str
track = TRACKS[no_track]
dataset = track.datasets[no_dataset]
MODELS["Binary classification"].update(MODELS["Multiclass classification"])
model = MODELS[track.name][model_name].clone()
print(f'Processing {model_str} on {dataset.__class__.__name__}')
print(f"Processing {model_str} on {dataset.__class__.__name__}")

results = []
track = copy.deepcopy(track)
time = 0.0
for i in tqdm(track.run(model, dataset, n_checkpoints=N_CHECKPOINTS), total=N_CHECKPOINTS):
time += i['Time'].total_seconds()
for i in tqdm(
track.run(model, dataset, n_checkpoints=N_CHECKPOINTS),
total=N_CHECKPOINTS,
):
time += i["Time"].total_seconds()
res = {
"step": i["Step"],
"track": track.name,
@@ -37,17 +41,20 @@ def run_dataset(model_str,no_dataset, no_track):
for k, v in i.items():
if isinstance(v, metrics.base.Metric):
res[k] = v.get()
res["Memory in Mb"] = i['Memory'] / 1024 ** 2
res["Memory in Mb"] = i["Memory"] / 1024**2
res["Time in s"] = time
results.append(res)
if time > 3600:
break
return results


def run_track(models: List[str], no_track: int, n_workers: int = 50):
pool = multiprocessing.Pool(processes=n_workers)
track = TRACKS[no_track]
runs = list(itertools.product(models, range(len(track.datasets)), [no_track]))
runs = list(
itertools.product(models, range(len(track.datasets)), [no_track])
)
results = []

for val in pool.starmap(run_dataset, runs):
@@ -56,7 +63,7 @@ def run_track(models: List[str], no_track: int, n_workers: int = 50):
pd.DataFrame(results).to_csv(f"./{csv_name}.csv", index=False)


if __name__ == '__main__':
if __name__ == "__main__":

MODELS["Binary classification"].update(MODELS["Multiclass classification"])

@@ -66,9 +73,10 @@ def run_track(models: List[str], no_track: int, n_workers: int = 50):
details[track.name] = {"Dataset": {}, "Model": {}}
for dataset in track.datasets:
details[track.name]["Dataset"][dataset.__class__.__name__] = repr(
dataset)
dataset
)
for model_name, model in MODELS[track.name].items():
details[track.name]["Model"][model_name] = repr(model)
with open("details.json", "w") as f:
json.dump(details, f, indent=2)
run_track(models=MODELS[track.name].keys(), no_track=i, n_workers=10)
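The fan-out in `run_track` is plain `itertools.product` feeding `Pool.starmap`; here is a self-contained miniature of that pattern (illustrative names, not the benchmark code):

```python
# Miniature of the run_track parallelisation pattern: one task per
# (model, dataset, track) combination, distributed across workers.
import itertools
import multiprocessing


def run_one(model: str, no_dataset: int, no_track: int) -> str:
    return f"{model} on dataset {no_dataset} of track {no_track}"


if __name__ == "__main__":
    runs = list(itertools.product(["logreg", "mlp"], range(3), [0]))
    with multiprocessing.Pool(processes=2) as pool:
        for res in pool.starmap(run_one, runs):
            print(res)
```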