diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3762d989..e0c13bcc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -126,7 +126,7 @@ jobs: cargo +nightly rustdoc --all-features -- --D warnings --cfg doc_cfg -A unexpected_cfgs - name: msrv run: | - cargo +nightly generate-lockfile -Zmsrv-policy --config "resolver.something-like-precedence='something-like-rust-version'" + cargo +nightly generate-lockfile -Zmsrv-policy --config "resolver.incompatible-rust-versions='fallback'" cat Cargo.lock cargo +1.70.0 check -p autograph -p neural-network-mnist-example --all-features --all-targets -v cargo +1.70.0 check -p neural-network-benches --all-targets -v diff --git a/README.md b/README.md index 1a23cb78..a6b38b4a 100644 --- a/README.md +++ b/README.md @@ -116,21 +116,21 @@ _NVIDIA GeForce GTX 1060 with Max-Q Design_ ## LeNet5(training, batch_size = 100) -| | `autograph` | `tch` | -| :---------------- | :------------------------- | :------------------------------- | -| **`bf16_host`** | `482.80 ms` (✅ **1.00x**) | `75.30 ms` (🚀 **6.41x faster**) | -| **`f32_host`** | `5.44 ms` (✅ **1.00x**) | `3.09 ms` (✅ **1.76x faster**) | -| **`bf16_device`** | `1.76 ms` (✅ **1.00x**) | `17.99 ms` (❌ _10.20x slower_) | -| **`f32_device`** | `1.75 ms` (✅ **1.00x**) | `1.20 ms` (✅ **1.45x faster**) | +| | `autograph` | `tch` | `candle` | +|:------------------|:--------------------------|:---------------------------------|:-------------------------------- | +| **`bf16_host`** | `498.54 ms` (✅ **1.00x**) | `75.26 ms` (🚀 **6.62x faster**) | `N/A` | +| **`f32_host`** | `8.25 ms` (✅ **1.00x**) | `3.14 ms` (🚀 **2.63x faster**) | `34.17 ms` (❌ *4.14x slower*) | +| **`bf16_device`** | `1.76 ms` (✅ **1.00x**) | `17.63 ms` (❌ *10.02x slower*) | `N/A` | +| **`f32_device`** | `1.73 ms` (✅ **1.00x**) | `1.19 ms` (✅ **1.45x faster**) | `9.76 ms` (❌ *5.64x slower*) | ## LeNet5(inference, batch_size = 1,000) -| | `autograph` | `tch` | -| :---------------- | 
:------------------------ | :-------------------------------- | -| **`bf16_host`** | `1.78 s` (✅ **1.00x**) | `192.75 ms` (🚀 **9.25x faster**) | -| **`f32_host`** | `12.23 ms` (✅ **1.00x**) | `9.57 ms` (✅ **1.28x faster**) | -| **`bf16_device`** | `4.62 ms` (✅ **1.00x**) | `48.72 ms` (❌ _10.54x slower_) | -| **`f32_device`** | `4.76 ms` (✅ **1.00x**) | `1.84 ms` (🚀 **2.58x faster**) | +| | `autograph` | `tch` | `candle` | +|:------------------|:-------------------------|:---------------------------------|:-------------------------------- | +| **`bf16_host`** | `1.81 s` (✅ **1.00x**) | `193.60 ms` (🚀 **9.37x faster**) | `N/A` | +| **`f32_host`** | `15.56 ms` (✅ **1.00x**) | `9.46 ms` (✅ **1.64x faster**) | `94.23 ms` (❌ *6.06x slower*) | +| **`bf16_device`** | `4.65 ms` (✅ **1.00x**) | `48.63 ms` (❌ *10.46x slower*) | `N/A` | +| **`f32_device`** | `4.65 ms` (✅ **1.00x**) | `1.84 ms` (🚀 **2.52x faster**) | `10.81 ms` (❌ *2.33x slower*) | See the [Neural Network](benches/neural-network-benches) benchmark. 
diff --git a/benches/neural-network-benches/Cargo.toml b/benches/neural-network-benches/Cargo.toml index bd1199ba..f09e588a 100644 --- a/benches/neural-network-benches/Cargo.toml +++ b/benches/neural-network-benches/Cargo.toml @@ -18,6 +18,8 @@ tch = { version = "0.12.0", optional = true } criterion = { version = "0.4.0", default-features = false } anyhow = { workspace = true } bytemuck = { workspace = true, optional = true } +candle-nn = { version = "0.6.0", optional = true } +candle-core = { version = "0.6.0", optional = true } [dev-dependencies] num-format.workspace = true @@ -25,8 +27,9 @@ num-format.workspace = true [features] default = ["device"] device = ["autograph/device"] -cuda = [] +cuda = ["candle-nn?/cuda"] tch = ["dep:tch", "dep:bytemuck"] +candle = ["dep:candle-nn", "dep:candle-core"] [[bench]] name = "benchmarks" diff --git a/benches/neural-network-benches/benches/benchmarks.rs b/benches/neural-network-benches/benches/benchmarks.rs index 05cb0a0f..184988a9 100644 --- a/benches/neural-network-benches/benches/benchmarks.rs +++ b/benches/neural-network-benches/benches/benchmarks.rs @@ -1,11 +1,51 @@ use autograph::krnl::{device::Device, scalar::ScalarType}; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; use neural_network_benches::autograph_backend; +#[cfg(feature = "candle")] +use neural_network_benches::candle_backend; #[cfg(feature = "tch")] use neural_network_benches::tch_backend; use num_format::{Locale, ToFormattedString}; use std::str::FromStr; +fn autograph_devices( + #[cfg_attr(not(feature = "device"), allow(unused))] index: usize, +) -> impl IntoIterator<Item = Device> { + [ + Device::host(), + #[cfg(feature = "device")] + Device::builder().index(index).build().unwrap(), + ] +} + +#[cfg(feature = "tch")] +fn tch_devices( + #[cfg_attr(not(feature = "cuda"), allow(unused))] index: usize, +) -> impl IntoIterator<Item = tch::Device> { + use tch::Device; + + [ + Device::Cpu, + #[cfg(feature = "cuda")] + Device::Cuda(index), + ] +} + +#[cfg(feature = 
"candle")] +fn candle_devices( + #[cfg_attr(not(feature = "cuda"), allow(unused))] index: usize, +) -> impl IntoIterator<Item = candle_core::Device> { + use candle_core::Device; + #[cfg(feature = "cuda")] + use candle_core::{backend::BackendDevice, CudaDevice}; + + [ + Device::Cpu, + #[cfg(feature = "cuda")] + Device::Cuda(CudaDevice::new(index).unwrap()), + ] +} + pub fn criterion_benchmark(c: &mut Criterion) { let device_index = if cfg!(feature = "device") { let krnl_device = std::env::var("KRNL_DEVICE"); @@ -20,8 +60,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { } else { 0 }; - - #[cfg_attr(not(feature = "cuda"), allow(unused))] + #[allow(unused)] let cuda_device_index = if cfg!(feature = "cuda") { let cuda_device = std::env::var("CUDA_DEVICE"); println!("CUDA_DEVICE = {cuda_device:?}"); @@ -35,7 +74,6 @@ pub fn criterion_benchmark(c: &mut Criterion) { } else { 0 }; - { // training let train_batch_size = 100; @@ -45,15 +83,7 @@ pub fn criterion_benchmark(c: &mut Criterion) { )); { let scalar_types = [ScalarType::BF16, ScalarType::F32]; - let devices = if cfg!(feature = "device") { - vec![ - Device::host(), - Device::builder().index(device_index).build().unwrap(), - ] - } else { - vec![Device::host()] - }; - for device in devices { + for device in autograph_devices(device_index) { let device_name = if device.is_device() { "device" } else { "host" }; for scalar_type in scalar_types { let scalar_name = scalar_type.name(); @@ -73,15 +103,10 @@ pub fn criterion_benchmark(c: &mut Criterion) { } #[cfg(feature = "tch")] { - use tch::{kind::Kind, Device}; + use tch::kind::Kind; let kinds = [Kind::BFloat16, Kind::Float]; - let devices = if cfg!(feature = "cuda") { - vec![Device::Cpu, Device::Cuda(cuda_device_index)] - } else { - vec![Device::Cpu] - }; - for device in devices { + for device in tch_devices(cuda_device_index) { let device_name = if device.is_cuda() { "device" } else { "host" }; for kind in kinds { let kind_name = match kind { @@ -104,8 +129,35 @@ pub fn criterion_benchmark(c: 
&mut Criterion) { } } } - } + #[cfg(feature = "candle")] + { + use candle_core::DType; + let dtypes = [/* Not Supported DType::BF16,*/ DType::F32]; + for device in candle_devices(cuda_device_index) { + let device_name = if device.is_cuda() { "device" } else { "host" }; + for dtype in dtypes { + let scalar_name = match dtype { + //DType::BF16 => "bf16", + DType::F32 => "f32", + _ => unreachable!(), + }; + let name = format!("{scalar_name}_{device_name}"); + let id = BenchmarkId::new("candle", name); + g.bench_function(id, |b| { + use candle_backend::LeNet5Classifier; + let mut model = LeNet5Classifier::new(device.clone(), dtype) + .unwrap() + .with_sgd(false) + .unwrap(); + b.iter(|| { + model.train(train_batch_size).unwrap(); + }); + }); + } + } + } + } { // inference let infer_batch_size = 1000; @@ -113,18 +165,9 @@ pub fn criterion_benchmark(c: &mut Criterion) { "LeNet5(inference, batch_size = {})", infer_batch_size.to_formatted_string(&Locale::en) )); - { let scalar_types = [ScalarType::BF16, ScalarType::F32]; - let devices = if cfg!(feature = "device") { - vec![ - Device::host(), - Device::builder().index(device_index).build().unwrap(), - ] - } else { - vec![Device::host()] - }; - for device in devices { + for device in autograph_devices(device_index) { let device_name = if device.is_device() { "device" } else { "host" }; for scalar_type in scalar_types { let scalar_name = scalar_type.name(); @@ -140,18 +183,12 @@ pub fn criterion_benchmark(c: &mut Criterion) { } } } - #[cfg(feature = "tch")] { - use tch::{kind::Kind, Device}; + use tch::kind::Kind; let kinds = [Kind::BFloat16, Kind::Float]; - let devices = if cfg!(feature = "cuda") { - vec![Device::Cpu, Device::Cuda(cuda_device_index)] - } else { - vec![Device::Cpu] - }; - for device in devices { + for device in tch_devices(cuda_device_index) { let device_name = if device.is_cuda() { "device" } else { "host" }; for kind in kinds { let kind_name = match kind { @@ -171,6 +208,34 @@ pub fn criterion_benchmark(c: 
&mut Criterion) { } } } + #[cfg(feature = "candle")] + { + use candle_core::DType; + + let dtypes = [/* Not Supported DType::BF16,*/ DType::F32]; + for device in candle_devices(cuda_device_index) { + let device_name = if device.is_cuda() { "device" } else { "host" }; + for dtype in dtypes { + let scalar_name = match dtype { + //DType::BF16 => "bf16", + DType::F32 => "f32", + _ => unreachable!(), + }; + let name = format!("{scalar_name}_{device_name}"); + let id = BenchmarkId::new("candle", name); + g.bench_function(id, |b| { + use candle_backend::LeNet5Classifier; + let model = LeNet5Classifier::new(device.clone(), dtype) + .unwrap() + .with_sgd(false) + .unwrap(); + b.iter(|| { + model.infer(infer_batch_size).unwrap(); + }); + }); + } + } + } } if cfg!(all(feature = "device", feature = "tch")) { eprintln!("warning: sig abort in torch on exit when vulkan is used"); diff --git a/benches/neural-network-benches/src/autograph_backend.rs b/benches/neural-network-benches/src/autograph_backend.rs index e6b38149..d3130330 100644 --- a/benches/neural-network-benches/src/autograph_backend.rs +++ b/benches/neural-network-benches/src/autograph_backend.rs @@ -1,6 +1,6 @@ +use crate::half::bf16; use anyhow::Result; use autograph::{ - half::bf16, krnl::{device::Device, scalar::ScalarType}, learn::{ criterion::CrossEntropyLoss, diff --git a/benches/neural-network-benches/src/candle_backend.rs b/benches/neural-network-benches/src/candle_backend.rs new file mode 100644 index 00000000..66d685cc --- /dev/null +++ b/benches/neural-network-benches/src/candle_backend.rs @@ -0,0 +1,101 @@ +use anyhow::Result; +use candle_core::{DType, Device, Tensor, Var}; +use candle_nn::{ + conv2d_no_bias, linear, linear_no_bias, loss::cross_entropy, Conv2d, Conv2dConfig, Linear, + Module, Optimizer, VarBuilder, VarMap, SGD, +}; + +pub struct LeNet5Classifier { + device: Device, + dtype: DType, + model: LeNet5, + optimizer: Option<SGD>, + varmap: VarMap, + _var_builder: VarBuilder<'static>, +} + +impl 
LeNet5Classifier { + pub fn new(device: Device, dtype: DType) -> Result<Self> { + let varmap = VarMap::new(); + let var_builder = VarBuilder::from_varmap(&varmap, dtype, &device); + let model = LeNet5::new(&var_builder)?; + Ok(Self { + device, + dtype, + model, + optimizer: None, + varmap, + _var_builder: var_builder, + }) + } + pub fn with_sgd(self, momentum: bool) -> Result<Self> { + if momentum { + anyhow::bail!("Momentum not supported by candle!"); + } + /* + let momentum = if momentum { 0.01 } else { 0.0 }; + */ + let learning_rate = 0.01; + let optimizer = SGD::new(self.varmap.all_vars(), learning_rate)?; + Ok(Self { + optimizer: Some(optimizer), + ..self + }) + } + pub fn infer(&self, batch_size: usize) -> Result<()> { + let x = Tensor::zeros((batch_size, 1, 28, 28), self.dtype, &self.device)?; + let _y = self.model.forward(&x)?; + Ok(()) + } + pub fn train(&mut self, batch_size: usize) -> Result<()> { + let x = Var::zeros((batch_size, 1, 28, 28), self.dtype, &self.device)?; + let t = Tensor::zeros(batch_size, DType::U32, &self.device)?; + let y = self.model.forward(&x)?; + let loss = cross_entropy(&y, &t)?; + self.optimizer.as_mut().unwrap().backward_step(&loss)?; + Ok(()) + } +} + +#[derive(Debug)] +struct LeNet5 { + conv1: Conv2d, + conv2: Conv2d, + dense1: Linear, + dense2: Linear, + dense3: Linear, +} + +impl LeNet5 { + fn new(var_builder: &VarBuilder) -> Result<Self> { + let conv1 = conv2d_no_bias(1, 6, 5, Conv2dConfig::default(), var_builder.pp("conv1"))?; + let conv2 = conv2d_no_bias(6, 16, 5, Conv2dConfig::default(), var_builder.pp("conv2"))?; + let dense1 = linear_no_bias(16 * 4 * 4, 128, var_builder.pp("dense1"))?; + let dense2 = linear_no_bias(128, 84, var_builder.pp("dense2"))?; + let dense3 = linear(84, 10, var_builder.pp("dense3"))?; + Ok(Self { + conv1, + conv2, + dense1, + dense2, + dense3, + }) + } +} + +impl Module for LeNet5 { + fn forward(&self, xs: &Tensor) -> Result<Tensor, candle_core::Error> { + let Self { + conv1, + conv2, + dense1, + dense2, + dense3, + } = self; + let x = 
conv1.forward(xs)?.relu()?.max_pool2d(2)?; + let x = conv2.forward(&x)?.relu()?.max_pool2d(2)?.flatten_from(1)?; + let x = dense1.forward(&x)?.relu()?; + let x = dense2.forward(&x)?.relu()?; + dense3.forward(&x) + } +} diff --git a/benches/neural-network-benches/src/lib.rs b/benches/neural-network-benches/src/lib.rs index d7e0e587..c21b673e 100644 --- a/benches/neural-network-benches/src/lib.rs +++ b/benches/neural-network-benches/src/lib.rs @@ -1,3 +1,6 @@ +use autograph::half; pub mod autograph_backend; +#[cfg(feature = "candle")] +pub mod candle_backend; #[cfg(feature = "tch")] pub mod tch_backend; diff --git a/benches/neural-network-benches/src/tch_backend.rs b/benches/neural-network-benches/src/tch_backend.rs index 3dfb9bd9..15c9a73b 100644 --- a/benches/neural-network-benches/src/tch_backend.rs +++ b/benches/neural-network-benches/src/tch_backend.rs @@ -1,5 +1,5 @@ +use crate::half::bf16; use anyhow::Result; -use autograph::half::bf16; use tch::{ kind::Kind, nn::{