Merge pull request #5 from perpetual-ml/parallelism
improved parallelism
deadsoul44 committed Sep 19, 2024
2 parents 57a5131 + 96cd877 commit ba4e520
Showing 30 changed files with 2,689 additions and 1,980 deletions.
10 changes: 5 additions & 5 deletions Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.3.8"
version = "0.4.0"
edition = "2021"
authors = ["Mutlu Simsek <msimsek@perpetual-ml.com>"]
homepage = "https://perpetual-ml.com"
@@ -22,11 +22,11 @@ codegen-units = 1
[dependencies]
rayon = "1.8"
thiserror = "1.0"
serde_json = { version = "1.0", features = ["float_roundtrip"] }
serde = { version = "1.0", features = ["derive"] }
rand = "0.8"
serde_json = { version = "1.0.127", features = ["float_roundtrip"] }
serde = { version = "1.0.209", features = ["derive"] }
approx = "0.5"
log = "0.4"
hashbrown = { version = "0.14", features = ["serde", "rayon"] }
rand = "0.8.5"

[dev-dependencies]
criterion = "0.5"
24 changes: 11 additions & 13 deletions README.md
@@ -7,33 +7,31 @@
[![Python Versions](https://img.shields.io/pypi/pyversions/perpetual.svg?logo=python&logoColor=white)](https://pypi.org/project/perpetual)
[![PyPI Version](https://img.shields.io/pypi/v/perpetual.svg?logo=pypi&logoColor=white)](https://pypi.org/project/perpetual)
[![Crates.io Version](https://img.shields.io/crates/v/perpetual?logo=rust&logoColor=white)](https://crates.io/crates/perpetual)
[![Discord](https://img.shields.io/discord/1247650900214812692?logo=discord&cacheSeconds=10)](https://discord.gg/vADKk9Wr)
[![Static Badge](https://img.shields.io/badge/join-discord-blue?logo=discord)](https://discord.gg/aQmKKUuJ)

</div>

# Perpetual

## _A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization_

PerpetualBooster is a gradient boosting machine (GBM) algorithm that, unlike other GBM algorithms, doesn't need hyperparameter optimization. Similar to AutoML libraries, it has a `budget` parameter. Increasing the `budget` parameter increases the predictive power of the algorithm and gives better results on unseen data. Start with a small budget (e.g. 1.0) and increase it (e.g. 2.0) once you are confident in your features. If a further increase of the `budget` brings no improvement, you are already extracting the most predictive power out of your data.
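
A minimal Rust sketch of that workflow, using a tiny synthetic dataset as a stand-in for real features; the builder call and the positional `fit` arguments mirror the updated `cal_housing` example in this commit, and the trailing `None`s simply leave the remaining optional arguments at their defaults:

```rust
use perpetual::{objective::Objective, Matrix, PerpetualBooster};
use std::error::Error;

fn main() -> Result<(), Box<dyn Error>> {
    // Placeholder data: one feature, 100 rows, stored as a flat column-major buffer.
    let x: Vec<f64> = (0..100).map(|i| i as f64).collect();
    let y: Vec<f64> = x.iter().map(|v| 2.0 * v + 1.0).collect();
    let matrix = Matrix::new(&x, y.len(), 1);

    // Start with a small budget ...
    let mut model = PerpetualBooster::default().set_objective(Objective::SquaredLoss);
    model.fit(&matrix, &y, None, None, 1.0, None, None)?;
    println!("n_rounds at budget 1.0: {}", model.get_prediction_trees().len());

    // ... and increase it once the feature set looks right.
    let mut model = PerpetualBooster::default().set_objective(Objective::SquaredLoss);
    model.fit(&matrix, &y, None, None, 2.0, None, None)?;
    println!("n_rounds at budget 2.0: {}", model.get_prediction_trees().len());

    Ok(())
}
```

Re-fitting with a larger budget is how you probe whether the features still carry unused signal.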

## Benchmark

Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves around 100x speed-up at the same accuracy with different `budget` levels and with different datasets. The speed-up might be slightly lower or significantly higher than 100x depending on the dataset.
Hyperparameter optimization usually takes 100 iterations with plain GBM algorithms. PerpetualBooster achieves the same accuracy in a single run. Thus, it achieves up to 100x speed-up at the same accuracy with different `budget` levels and with different datasets.

The following table summarizes the results for the [California Housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html) dataset (regression):

| Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Perpetual cpu time | LightGBM cpu time | Speed-up |
| ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- | -------- |
| 1.0 | 100 | 0.192 | 0.192 | 7.6 | 978 | 129x |
| 1.5 | 300 | 0.188 | 0.188 | 21.8 | 3066 | 141x |
| 2.1 | 1000 | 0.185 | 0.186 | 86.0 | 8720 | 101x |
| Perpetual budget | LightGBM n_estimators | Perpetual mse | LightGBM mse | Speed-up wall time | Speed-up cpu time |
| ---------------- | --------------------- | ------------- | ------------ | ------------------ | ----------------- |
| 1.0 | 100 | 0.192 | 0.192 | 54x | 56x |
| 1.5 | 300 | 0.188 | 0.188 | 59x | 58x |
| 2.1 | 1000 | 0.185 | 0.186 | 42x | 41x |

The following table summarizes the results for the [Cover Types](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_covtype.html) dataset (classification):

| Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Perpetual cpu time | LightGBM cpu time | Speed-up |
| ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- | -------- |
| 1.0 | 100 | 0.089 | 0.084 | 1653 | 124958 | 76x |
| Perpetual budget | LightGBM n_estimators | Perpetual log loss | LightGBM log loss | Speed-up wall time | Speed-up cpu time |
| ---------------- | --------------------- | ------------------ | ----------------- | ------------------ | ----------------- |
| 0.9 | 100 | 0.091 | 0.084 | 72x | 78x |

You can reproduce the results using the scripts in the [examples](./python-package/examples) folder.

@@ -63,7 +61,7 @@ pip install perpetual
To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).

```toml
perpetual = "0.3.8"
perpetual = "0.4.0"
```

## Paper
48 changes: 27 additions & 21 deletions benches/perpetual_benchmarks.rs
@@ -1,13 +1,12 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use hashbrown::HashMap;
use perpetual::binning::bin_matrix;
use perpetual::booster::PerpetualBooster;
use perpetual::constants::N_NODES_ALLOCATED;
use perpetual::constraints::ConstraintMap;
use perpetual::data::Matrix;
use perpetual::histogram::HistogramMatrix;
use perpetual::histogram::{NodeHistogram, NodeHistogramOwned};
use perpetual::objective::{LogLoss, ObjectiveFunction};
use perpetual::splitter::MissingImputerSplitter;
use perpetual::splitter::{MissingImputerSplitter, SplitInfo, SplitInfoSlice};
use perpetual::tree::Tree;
use perpetual::utils::{fast_f64_sum, fast_sum, naive_sum};
use std::fs;
@@ -34,22 +33,26 @@ pub fn tree_benchmarks(c: &mut Criterion) {
});

let data = Matrix::new(&data_vec, y.len(), 5);
let splitter = MissingImputerSplitter {
eta: 0.3,
allow_missing_splits: true,
constraints_map: ConstraintMap::new(),
};
let splitter = MissingImputerSplitter::new(0.3, true, ConstraintMap::new());
let mut tree = Tree::new();

let bindata = bin_matrix(&data, None, 300, f64::NAN, None).unwrap();
let bdata = Matrix::new(&bindata.binned_data, data.rows, data.cols);
let col_index: Vec<usize> = (0..data.cols).collect();

let hist_init = HistogramMatrix::empty(&bdata, &bindata.cuts, &col_index, true, false);
let mut hist_map: HashMap<usize, HistogramMatrix> = HashMap::with_capacity(N_NODES_ALLOCATED);
for i in 0..N_NODES_ALLOCATED {
hist_map.insert(i, hist_init.clone());
}
let mut hist_tree_owned: Vec<NodeHistogramOwned> = (0..N_NODES_ALLOCATED)
.map(|_| NodeHistogramOwned::empty(&bindata.cuts, &col_index, false, true))
.collect();

let mut hist_tree: Vec<NodeHistogram> = hist_tree_owned
.iter_mut()
.map(|node_hist| NodeHistogram::from_owned(node_hist))
.collect();

let pool = rayon::ThreadPoolBuilder::new().num_threads(2).build().unwrap();

let mut split_info_vec: Vec<SplitInfo> = (0..col_index.len()).map(|_| SplitInfo::default()).collect();
let split_info_slice = SplitInfoSlice::new(&mut split_info_vec);

tree.fit(
&bdata,
@@ -58,7 +61,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
&mut g,
h.as_deref_mut(),
&splitter,
true,
&pool,
Some(f32::MAX),
&loss,
&y,
@@ -67,8 +70,9 @@ pub fn tree_benchmarks(c: &mut Criterion) {
None,
None,
false,
&mut hist_map,
&mut hist_tree,
None,
&split_info_slice,
);

println!("{}", tree.nodes.len());
@@ -83,7 +87,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
black_box(&mut g),
black_box(h.as_deref_mut()),
black_box(&splitter),
black_box(false),
black_box(&pool),
Some(f32::MAX),
black_box(&loss),
black_box(&y),
@@ -92,8 +96,9 @@ pub fn tree_benchmarks(c: &mut Criterion) {
None,
None,
false,
black_box(&mut hist_map),
black_box(&mut hist_tree),
None,
black_box(&split_info_slice),
);
})
});
@@ -108,7 +113,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
black_box(&mut g),
black_box(h.as_deref_mut()),
black_box(&splitter),
black_box(false),
black_box(&pool),
Some(f32::MAX),
black_box(&loss),
black_box(&y),
@@ -117,8 +122,9 @@ pub fn tree_benchmarks(c: &mut Criterion) {
None,
None,
false,
black_box(&mut hist_map),
black_box(&mut hist_tree),
None,
black_box(&split_info_slice),
);
})
});
@@ -137,7 +143,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
// booster_train.sampling_mode(SamplingMode::Linear);
booster_train.bench_function("train_booster_default", |b| {
b.iter(|| {
let mut booster = PerpetualBooster::default().set_parallel(false);
let mut booster = PerpetualBooster::default();
booster
.fit(
black_box(&data),
@@ -153,7 +159,7 @@ pub fn tree_benchmarks(c: &mut Criterion) {
});
booster_train.bench_function("train_booster_with_column_sampling", |b| {
b.iter(|| {
let mut booster = PerpetualBooster::default().set_parallel(false);
let mut booster = PerpetualBooster::default();
booster
.fit(
black_box(&data),
17 changes: 13 additions & 4 deletions examples/cal_housing.rs
@@ -1,6 +1,6 @@
//! An example using the `california housing` dataset

// cargo run --release --example cal_housing 0.1 0.3 2
// cargo run --release --example cal_housing 1.0 1

// cargo build --release --example cal_housing
// hyperfine --runs 3 ./target/release/examples/cal_housing
@@ -14,6 +14,7 @@ use perpetual::{objective::Objective, Matrix, PerpetualBooster};
use polars::prelude::*;
use std::env;
use std::error::Error;
use std::time::SystemTime;

pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 {
let mut error = 0.0;
@@ -26,7 +27,8 @@ pub fn mse(y_test: &[f64], y_pred: &[f64]) -> f32 {

fn main() -> Result<(), Box<dyn Error>> {
let args: Vec<String> = env::args().collect();
let budget = &args[1].parse::<f32>().unwrap();
let budget = &args[1].parse::<f32>().unwrap_or(1.0);
let num_threads = &args[2].parse::<usize>().unwrap_or(1);

let all_names = [
"MedInc".to_string(),
@@ -116,8 +118,13 @@ fn main() -> Result<(), Box<dyn Error>> {
// To provide parameters generate a default booster, and then use
// the relevant `set_` methods for any parameters you would like to
// adjust.
let mut model = PerpetualBooster::default().set_objective(Objective::SquaredLoss);
let mut model = PerpetualBooster::default()
.set_objective(Objective::SquaredLoss)
.set_num_threads(Some(*num_threads));

let now = SystemTime::now();
model.fit(&matrix_train, &y_train, None, None, *budget, None, None)?;
println!("now.elapsed: {:?}", now.elapsed().unwrap().as_secs_f32());

let trees = model.get_prediction_trees();
println!("n_rounds: {:?}", trees.len());
@@ -134,7 +141,9 @@ fn main() -> Result<(), Box<dyn Error>> {
println!("mse_test: {:?}", error);

println!("tree:");
println!("{}", trees[0]);
for t in trees {
println!("{}", t);
}

Ok(())
}
4 changes: 2 additions & 2 deletions python-package/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.3.8"
version = "0.4.0"
edition = "2021"
authors = ["Mutlu Simsek <msimsek@perpetual-ml.com>"]
homepage = "https://perpetual-ml.com"
@@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.21", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.3.8", path = "../" }
perpetual_rs = {package="perpetual", version = "0.4.0", path = "../" }
numpy = "0.21"
ndarray = "0.15"
serde_plain = { version = "1.0" }
2 changes: 1 addition & 1 deletion python-package/examples/benchmark_lgbm.py
@@ -80,7 +80,7 @@ def objective_function(

if __name__ == "__main__":
optuna.logging.set_verbosity(optuna.logging.WARNING)
cal_housing = False # True -> California Housing, False -> Cover Types
cal_housing = True # True -> California Housing, False -> Cover Types
n_estimators = 100
n_trials = 100
cpu_times = []
10 changes: 6 additions & 4 deletions python-package/examples/benchmark_perpetual.py
@@ -24,9 +24,9 @@ def prepare_data(cal_housing, seed):


if __name__ == "__main__":
budget = 1.1
parallel = False
cal_housing = False # True -> California Housing, False -> Cover Types
budget = 1.0
num_threads = 2
cal_housing = True # True -> California Housing, False -> Cover Types
cpu_times = []
wall_times = []
metrics = []
Expand All @@ -36,7 +36,9 @@ def prepare_data(cal_housing, seed):
prepare_data(cal_housing, seed)
)

model = PerpetualBooster(objective=objective, parallel=parallel)
model = PerpetualBooster(
objective=objective, num_threads=num_threads, log_iterations=0
)

start = process_time()
tick = time()
5 changes: 2 additions & 3 deletions python-package/examples/categorical_data.ipynb
@@ -10,7 +10,6 @@
"import numpy as np\n",
"import pandas as pd\n",
"from ucimlrepo import fetch_ucirepo\n",
"from perpetual import PerpetualBooster\n",
"from scipy.special import expit\n",
"from lightgbm import LGBMClassifier\n",
"from sklearn.metrics import log_loss, accuracy_score\n",
@@ -217,10 +216,10 @@
"outputs": [],
"source": [
"print(f\"Number of finished trials: {len(study.trials)}\")\n",
"print(f\"Best trial:\")\n",
"print(\"Best trial:\")\n",
"print(f\" Number: {study.best_trial.number}\")\n",
"print(f\" Value: {study.best_trial.value}\")\n",
"print(f\" Params: \")\n",
"print(\" Params: \")\n",
"for key, value in study.best_trial.params.items():\n",
" print(f\" {key}: {value}\")"
]
4 changes: 2 additions & 2 deletions python-package/examples/performance_benchmark.ipynb
@@ -163,10 +163,10 @@
"outputs": [],
"source": [
"print(f\"Number of finished trials: {len(study.trials)}\")\n",
"print(f\"Best trial:\")\n",
"print(\"Best trial:\")\n",
"print(f\" Number: {study.best_trial.number}\")\n",
"print(f\" Value: {study.best_trial.value}\")\n",
"print(f\" Params: \")\n",
"print(\" Params: \")\n",
"for key, value in study.best_trial.params.items():\n",
" print(f\" {key}: {value}\")"
]
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.3.8"
version = "0.4.0"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [