From 82464166e4d947a717509922a566e7ceaf4b3f2f Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Fri, 28 Jul 2023 12:07:39 +0200
Subject: [PATCH 01/10] 3rd phase.

---
 candle-book/src/SUMMARY.md                 | 10 +--
 candle-book/src/cuda/README.md             |  1 +
 candle-book/src/cuda/porting.md            |  1 +
 candle-book/src/cuda/writing.md            |  1 +
 candle-book/src/error_manage.md            | 38 +++++++++++
 candle-book/src/inference/README.md        |  6 ++
 candle-book/src/inference/hub.md           | 79 ++++++++++++++++++++++
 candle-book/src/inference/serialization.md |  2 +
 candle-book/src/training/serialization.md  |  1 +
 9 files changed, 134 insertions(+), 5 deletions(-)
 create mode 100644 candle-book/src/cuda/README.md
 create mode 100644 candle-book/src/cuda/porting.md
 create mode 100644 candle-book/src/cuda/writing.md
 create mode 100644 candle-book/src/training/serialization.md

diff --git a/candle-book/src/SUMMARY.md b/candle-book/src/SUMMARY.md
index ddd6e91655..e35a865f6f 100644
--- a/candle-book/src/SUMMARY.md
+++ b/candle-book/src/SUMMARY.md
@@ -12,11 +12,11 @@
 
 - [Running a model](inference/README.md)
   - [Using the hub](inference/hub.md)
-  - [Serialization](inference/serialization.md)
-  - [Advanced Cuda usage](inference/cuda/README.md)
-    - [Writing a custom kernel](inference/cuda/writing.md)
-    - [Porting a custom kernel](inference/cuda/porting.md)
 - [Error management](error_manage.md)
+- [Advanced Cuda usage](cuda/README.md)
+  - [Writing a custom kernel](cuda/writing.md)
+  - [Porting a custom kernel](cuda/porting.md)
+- [Using MKL](advanced/mkl.md)
 - [Creating apps](apps/README.md)
   - [Creating a WASM app](apps/wasm.md)
   - [Creating a REST api webserver](apps/rest.md)
@@ -24,4 +24,4 @@
 - [Training](training/README.md)
   - [MNIST](training/mnist.md)
   - [Fine-tuning](training/finetuning.md)
-- [Using MKL](advanced/mkl.md)
+  - [Serialization](training/serialization.md)
diff --git a/candle-book/src/cuda/README.md b/candle-book/src/cuda/README.md
new file mode 100644
index 0000000000..68434cbfe2
--- /dev/null
+++ b/candle-book/src/cuda/README.md
@@ -0,0 +1 @@
+# Advanced Cuda usage
diff --git a/candle-book/src/cuda/porting.md b/candle-book/src/cuda/porting.md
new file mode 100644
index 0000000000..e332146d7e
--- /dev/null
+++ b/candle-book/src/cuda/porting.md
@@ -0,0 +1 @@
+# Porting a custom kernel
diff --git a/candle-book/src/cuda/writing.md b/candle-book/src/cuda/writing.md
new file mode 100644
index 0000000000..0fe1f3dc7f
--- /dev/null
+++ b/candle-book/src/cuda/writing.md
@@ -0,0 +1 @@
+# Writing a custom kernel
diff --git a/candle-book/src/error_manage.md b/candle-book/src/error_manage.md
index 042e191f49..af7593d68d 100644
--- a/candle-book/src/error_manage.md
+++ b/candle-book/src/error_manage.md
@@ -1 +1,39 @@
 # Error management
+
+You might have seen in the code base a lot of `.unwrap()` or `?`.
+If you're unfamiliar with Rust, check out the [Rust book](https://doc.rust-lang.org/book/ch09-02-recoverable-errors-with-result.html)
+for more information.
+
+What's important to know, though, is that if you want to find out *where* a particular operation failed,
+you can simply use `RUST_BACKTRACE=1` to get the location where the model actually failed.
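+
+For example, assuming a standard Cargo project (the `myapp` name is simply the placeholder crate that also appears in the backtrace further down), you can enable backtraces by setting the variable for a single run:
+
+```bash
+RUST_BACKTRACE=1 cargo run
+```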
+
+Let's look at some failing code:
+
+```rust,ignore
+let x = Tensor::zeros((1, 784), DType::F32, &device)?;
+let y = Tensor::zeros((1, 784), DType::F32, &device)?;
+let z = x.matmul(&y)?;
+```
+
+This will print at runtime:
+
+```bash
+Error: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }
+```
+
+
+After adding `RUST_BACKTRACE=1`:
+
+
+```bash
+Error: WithBacktrace { inner: ShapeMismatchBinaryOp { lhs: [1, 784], rhs: [1, 784], op: "matmul" }, backtrace: Backtrace [{ fn: "candle::error::Error::bt", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/error.rs", line: 200 }, { fn: "candle::tensor::Tensor::matmul", file: "/home/nicolas/.cargo/git/checkouts/candle-5bb8ef7e0626d693/f291065/candle-core/src/tensor.rs", line: 816 }, { fn: "myapp::main", file: "./src/main.rs", line: 29 }, { fn: "core::ops::function::FnOnce::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 250 }, { fn: "std::sys_common::backtrace::__rust_begin_short_backtrace", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/sys_common/backtrace.rs", line: 135 }, { fn: "std::rt::lang_start::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 166 }, { fn: "core::ops::function::impls:: for &F>::call_once", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/core/src/ops/function.rs", line: 284 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal::{{closure}}", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::panicking::try::do_call", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 500 }, { fn: "std::panicking::try", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panicking.rs", line: 464 }, { fn: "std::panic::catch_unwind", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/panic.rs", line: 142 }, { fn: "std::rt::lang_start_internal", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 148 }, { fn: "std::rt::lang_start", file: "/rustc/8ede3aae28fe6e4d52b38157d7bfe0d3bceef225/library/std/src/rt.rs", line: 165 }, { fn: "main" }, { fn: "__libc_start_main" }, { fn: "_start" }] }
+```
+
+Not super pretty at the moment, but we can see that the error occurred at `{ fn: "myapp::main", file: "./src/main.rs", line: 29 }`.
+
+
+Another thing to note, is that since Rust is compiled it is not necessarily as easy to recover proper stacktraces
+especially in release builds. We're using [`anyhow`](https://docs.rs/anyhow/latest/anyhow/) for that.
+The library is still young, please [report](https://github.com/LaurentMazare/candle/issues) any issues detecting where an error is coming from.
+
+
diff --git a/candle-book/src/inference/README.md b/candle-book/src/inference/README.md
index c82f85e18b..1b75a31039 100644
--- a/candle-book/src/inference/README.md
+++ b/candle-book/src/inference/README.md
@@ -1 +1,7 @@
 # Running a model
+
+
+In order to run an existing model, you will need to download and use existing weights.
+Most models are already available on https://huggingface.co/ in [`safetensors`](https://github.com/huggingface/safetensors) format. + +Let's get started by running an old model : `bert-base-uncased`. diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md index 6242c07015..8cf375d773 100644 --- a/candle-book/src/inference/hub.md +++ b/candle-book/src/inference/hub.md @@ -1 +1,80 @@ # Using the hub + +Install the [`hf-hub`](https://github.com/huggingface/hf-hub) crate: + +```bash +cargo add hf-hub +``` + +Then let's start by downloading the [model file](https://huggingface.co/bert-base-uncased/tree/main). + + +```rust +# extern crate candle; +# extern crate hf_hub; +use hf_hub::api::sync::Api; +use candle::Device; + +let api = Api::new().unwrap(); +let repo = api.model("bert-base-uncased".to_string()); + +let weights = repo.get("model.safetensors").unwrap(); + +let weights = candle::safetensors::load(weights, &Device::Cpu); +``` + +We now have access to all the [tensors](https://huggingface.co/bert-base-uncased?show_tensors=true) within the file. + + +## Using async + +`hf-hub` comes with an async API. + +```bash +cargo add hf-hub --features tokio +``` + +```rust,ignore +# extern crate candle; +# extern crate hf_hub; +use hf_hub::api::tokio::Api; +use candle::Device; + +let api = Api::new().unwrap(); +let repo = api.model("bert-base-uncased".to_string()); + +let weights = repo.get("model.safetensors").await.unwrap(); + +let weights = candle::safetensors::load(weights, &Device::Cpu); +``` + + +## Using in a real model. + +Now that we have our weights, we can use them in our bert architecture: + +```rust +# extern crate candle; +# extern crate candle_nn; +# extern crate hf_hub; +# use hf_hub::api::sync::Api; +# use candle::Device; +# +# let api = Api::new().unwrap(); +# let repo = api.model("bert-base-uncased".to_string()); +# +# let weights = repo.get("model.safetensors").unwrap(); +use candle_nn::Linear; + +let weights = candle::safetensors::load(weights, &Device::Cpu); + +let weight = weights.get("bert.encoder.layer.0.attention.self.query.weight").unwrap(); +let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap(); + +let linear = Linear::new(weight, Some(bias)); + +let input_ids = Tensor::zeros((3, 7680), DType::F32, &Device::Cpu).unwrap(); +let output = linear.forward(&input_ids); +``` + +For a full reference, you can check out the full [bert](https://github.com/LaurentMazare/candle/tree/main/candle-examples/examples/bert) example. diff --git a/candle-book/src/inference/serialization.md b/candle-book/src/inference/serialization.md index 0dfc62d35b..133ff02513 100644 --- a/candle-book/src/inference/serialization.md +++ b/candle-book/src/inference/serialization.md @@ -1 +1,3 @@ # Serialization + +Once you have a r diff --git a/candle-book/src/training/serialization.md b/candle-book/src/training/serialization.md new file mode 100644 index 0000000000..0dfc62d35b --- /dev/null +++ b/candle-book/src/training/serialization.md @@ -0,0 +1 @@ +# Serialization From 45642a8530fdfbd64fcac118aed59b7cb7dfaf45 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 1 Aug 2023 15:04:41 +0200 Subject: [PATCH 02/10] Fixing examples. 
--- candle-book/src/inference/hub.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md index 8cf375d773..de514322eb 100644 --- a/candle-book/src/inference/hub.md +++ b/candle-book/src/inference/hub.md @@ -58,20 +58,20 @@ Now that we have our weights, we can use them in our bert architecture: # extern crate candle_nn; # extern crate hf_hub; # use hf_hub::api::sync::Api; -# use candle::Device; # # let api = Api::new().unwrap(); # let repo = api.model("bert-base-uncased".to_string()); # # let weights = repo.get("model.safetensors").unwrap(); +use candle::{Device, Tensor, DType}; use candle_nn::Linear; -let weights = candle::safetensors::load(weights, &Device::Cpu); +let weights = candle::safetensors::load(weights, &Device::Cpu).unwrap(); let weight = weights.get("bert.encoder.layer.0.attention.self.query.weight").unwrap(); let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap(); -let linear = Linear::new(weight, Some(bias)); +let linear = Linear::new(weight.clone(), Some(bias.clone())); let input_ids = Tensor::zeros((3, 7680), DType::F32, &Device::Cpu).unwrap(); let output = linear.forward(&input_ids); From a44471a305f2bc768c4f0dd0e7d23a7cfe3cb408 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 1 Aug 2023 16:36:53 +0200 Subject: [PATCH 03/10] Adding more details on how to load things. - Loading with memmap - Loading a sharded tensor - Moved some snippets to `candle-examples/src/lib.rs` This is because managing book specific dependencies is a pain https://github.com/rust-lang/mdBook/issues/706 - This causes a non aligned inclusion https://github.com/rust-lang/mdBook/pull/1856 which we have to ignore fmt to remove. mdbook might need some more love :) --- candle-book/src/inference/hub.md | 46 +++++++++++---- candle-core/src/safetensors.rs | 6 +- candle-examples/Cargo.toml | 4 ++ candle-examples/src/lib.rs | 99 ++++++++++++++++++++++++++++++++ 4 files changed, 143 insertions(+), 12 deletions(-) diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md index de514322eb..01492df198 100644 --- a/candle-book/src/inference/hub.md +++ b/candle-book/src/inference/hub.md @@ -25,6 +25,8 @@ let weights = candle::safetensors::load(weights, &Device::Cpu); We now have access to all the [tensors](https://huggingface.co/bert-base-uncased?show_tensors=true) within the file. +You can check all the names of the tensors [here](https://huggingface.co/bert-base-uncased?show_tensors=true) + ## Using async @@ -35,17 +37,9 @@ cargo add hf-hub --features tokio ``` ```rust,ignore -# extern crate candle; -# extern crate hf_hub; -use hf_hub::api::tokio::Api; -use candle::Device; - -let api = Api::new().unwrap(); -let repo = api.model("bert-base-uncased".to_string()); - -let weights = repo.get("model.safetensors").await.unwrap(); - -let weights = candle::safetensors::load(weights, &Device::Cpu); +# This is tested directly in examples crate because it needs external dependencies unfortunately: +# See [this](https://github.com/rust-lang/mdBook/issues/706) +{{#include ../../../candle-examples/src/lib.rs:book_hub_1}} ``` @@ -78,3 +72,33 @@ let output = linear.forward(&input_ids); ``` For a full reference, you can check out the full [bert](https://github.com/LaurentMazare/candle/tree/main/candle-examples/examples/bert) example. 
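+
+If you are unsure which tensor names to pass to `get`, a quick way to list everything in the file is to iterate over the loaded map (a minimal sketch reusing the `weights` map from above; the exact names and shapes are model-specific):
+
+```rust,ignore
+for (name, tensor) in weights.iter() {
+    // Print each tensor name together with its shape.
+    println!("{name}: {:?}", tensor.shape());
+}
+```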
+
+## Memory mapping
+
+For more efficient loading, instead of reading the whole file, you could use [`memmap2`](https://docs.rs/memmap2/latest/memmap2/).
+
+**Note**: Be careful with memory mapping; it seems to cause issues on [Windows, WSL](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/5893),
+and it will definitely be slower on network-mounted disks, because it will issue more read calls.
+
+```rust,ignore
+{{#include ../../../candle-examples/src/lib.rs:book_hub_2}}
+```
+
+**Note**: This operation is **unsafe**. [See the safety notice](https://docs.rs/memmap2/latest/memmap2/struct.Mmap.html#safety).
+In practice, model files should never be modified, and the mmaps should be mostly READONLY anyway, so the caveat most likely does not apply, but always keep it in mind.
+
+
+## Tensor Parallel Sharding
+
+When using multiple GPUs with tensor parallelism in order to get good latency, you can load only the part of the Tensor you need.
+
+For that, you need to use [`safetensors`](https://crates.io/crates/safetensors) directly.
+
+```bash
+cargo add safetensors
+```
+
+
+```rust,ignore
+{{#include ../../../candle-examples/src/lib.rs:book_hub_3}}
+```
diff --git a/candle-core/src/safetensors.rs b/candle-core/src/safetensors.rs
index 1880a0411d..132fb914e5 100644
--- a/candle-core/src/safetensors.rs
+++ b/candle-core/src/safetensors.rs
@@ -242,7 +242,11 @@ fn convert_back(tensor: &Tensor) -> Result> {
 
 pub fn load>(filename: P, device: &Device) -> Result> {
     let data = std::fs::read(filename.as_ref())?;
-    let st = safetensors::SafeTensors::deserialize(&data)?;
+    load_buffer(&data[..], device)
+}
+
+pub fn load_buffer(data: &[u8], device: &Device) -> Result> {
+    let st = safetensors::SafeTensors::deserialize(data)?;
     st.tensors()
         .into_iter()
         .map(|(name, view)| Ok((name, view.load(device)?)))
diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml
index 0db960ca75..d4544ef7d0 100644
--- a/candle-examples/Cargo.toml
+++ b/candle-examples/Cargo.toml
@@ -25,6 +25,7 @@ half = { workspace = true, optional = true }
 [dev-dependencies]
 anyhow = { workspace = true }
 byteorder = { workspace = true }
+hf-hub = { workspace = true, features=["tokio"]}
 clap = { workspace = true }
 hf-hub = { workspace = true }
 memmap2 = { workspace = true }
 rand = { workspace = true }
 tokenizers = { workspace = true, features = ["onig"] }
 tracing = { workspace = true }
 tracing-chrome = { workspace = true }
 tracing-subscriber = { workspace = true }
 wav = { workspace = true }
+# Necessary to disambiguate with tokio in wasm examples which are 1.28.1
+tokio = "1.29.1"
+memmap2.workspace = true
 
 [build-dependencies]
 anyhow = { workspace = true }
diff --git a/candle-examples/src/lib.rs b/candle-examples/src/lib.rs
index 285aee049d..3410026ee8 100644
--- a/candle-examples/src/lib.rs
+++ b/candle-examples/src/lib.rs
@@ -11,3 +11,102 @@ pub fn device(cpu: bool) -> Result {
         Ok(device)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    // NOTE: Waiting on https://github.com/rust-lang/mdBook/pull/1856
+    #[rustfmt::skip]
+    #[tokio::test]
+    async fn book_hub_1() {
+// ANCHOR: book_hub_1
+use candle::Device;
+use hf_hub::api::tokio::Api;
+
+let api = Api::new().unwrap();
+let repo = api.model("bert-base-uncased".to_string());
+
+let weights_filename = repo.get("model.safetensors").await.unwrap();
+
+let weights = candle::safetensors::load(weights_filename, &Device::Cpu).unwrap();
+// ANCHOR_END: book_hub_1
+        assert_eq!(weights.len(), 206);
+    }
+
+    #[rustfmt::skip]
+    #[test]
+    fn book_hub_2() {
+// ANCHOR: book_hub_2
+use candle::Device;
+use hf_hub::api::sync::Api;
+use memmap2::Mmap;
+use std::fs;
+
+let api = Api::new().unwrap();
+let repo = api.model("bert-base-uncased".to_string());
+let weights_filename = repo.get("model.safetensors").unwrap();
+
+let file = fs::File::open(weights_filename).unwrap();
+let mmap = unsafe { Mmap::map(&file).unwrap() };
+let weights = candle::safetensors::load_buffer(&mmap[..], &Device::Cpu).unwrap();
+// ANCHOR_END: book_hub_2
+        assert_eq!(weights.len(), 206);
+    }
+
+    #[rustfmt::skip]
+    #[test]
+    fn book_hub_3() {
+// ANCHOR: book_hub_3
+use candle::{DType, Device, Tensor};
+use hf_hub::api::sync::Api;
+use memmap2::Mmap;
+use safetensors::slice::IndexOp;
+use safetensors::SafeTensors;
+use std::fs;
+
+let api = Api::new().unwrap();
+let repo = api.model("bert-base-uncased".to_string());
+let weights_filename = repo.get("model.safetensors").unwrap();
+
+let file = fs::File::open(weights_filename).unwrap();
+let mmap = unsafe { Mmap::map(&file).unwrap() };
+
+// Use safetensors directly
+let tensors = SafeTensors::deserialize(&mmap[..]).unwrap();
+let view = tensors
+.tensor("bert.encoder.layer.0.attention.self.query.weight")
+.unwrap();
+
+// We're going to load shard with rank 1, within a world_size of 4
+// We're going to split along dimension 0 doing VIEW[start..stop, :]
+let rank = 1;
+let world_size = 4;
+let dim = 0;
+let dtype = view.dtype();
+let mut tp_shape = view.shape().to_vec();
+let size = tp_shape[0];
+
+if size % world_size != 0 {
+panic!("The dimension is not divisble by `world_size`");
+}
+let block_size = size / world_size;
+let start = rank * block_size;
+let stop = (rank + 1) * block_size;
+
+// Everything is expressed in tensor dimension
+// bytes offsets is handled automatically for safetensors.
+
+let iterator = view.slice(start..stop).unwrap();
+
+tp_shape[dim] = block_size;
+
+// Convert safetensors Dtype to candle DType
+let dtype: DType = dtype.try_into().unwrap();
+
+// TODO: Implement from_buffer_iterator to we can skip the extra CPU alloc.
+let raw: Vec = iterator.into_iter().flatten().cloned().collect();
+let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).unwrap();
+// ANCHOR_END: book_hub_3
+        assert_eq!(view.shape(), &[768, 768]);
+        assert_eq!(tp_tensor.dims(), &[192, 768]);
+    }
+}
From a70b95f9e7f7e5aa66e647b51cb2849228077a47 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Tue, 1 Aug 2023 16:49:35 +0200
Subject: [PATCH 04/10] Marking unwritten chapters as Draft (disables the link).
--- candle-book/src/SUMMARY.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/candle-book/src/SUMMARY.md b/candle-book/src/SUMMARY.md index e35a865f6f..3432f66f1d 100644 --- a/candle-book/src/SUMMARY.md +++ b/candle-book/src/SUMMARY.md @@ -12,16 +12,16 @@ - [Running a model](inference/README.md) - [Using the hub](inference/hub.md) -- [Error management](error_manage.md) -- [Advanced Cuda usage](cuda/README.md) - - [Writing a custom kernel](cuda/writing.md) - - [Porting a custom kernel](cuda/porting.md) -- [Using MKL](advanced/mkl.md) -- [Creating apps](apps/README.md) - - [Creating a WASM app](apps/wasm.md) - - [Creating a REST api webserver](apps/rest.md) - - [Creating a desktop Tauri app](apps/dekstop.md) -- [Training](training/README.md) - - [MNIST](training/mnist.md) - - [Fine-tuning](training/finetuning.md) - - [Serialization](training/serialization.md) +- [Error management]() +- [Advanced Cuda usage]() + - [Writing a custom kernel]() + - [Porting a custom kernel]() +- [Using MKL]() +- [Creating apps]() + - [Creating a WASM app]() + - [Creating a REST api webserver]() + - [Creating a desktop Tauri app]() +- [Training]() + - [MNIST]() + - [Fine-tuning]() + - [Serialization]() From 1b705a426f6196536fc6b587f855113dc26bc3ab Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 2 Aug 2023 09:21:44 +0200 Subject: [PATCH 05/10] Remove duplicate. --- candle-examples/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index d4544ef7d0..8779b9a5ca 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -27,7 +27,6 @@ anyhow = { workspace = true } byteorder = { workspace = true } hf-hub = { workspace = true, features=["tokio"]} clap = { workspace = true } -hf-hub = { workspace = true } memmap2 = { workspace = true } rand = { workspace = true } tokenizers = { workspace = true, features = ["onig"] } From c11e78b33454b976ad97b1534cc06eb027356865 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 2 Aug 2023 09:22:27 +0200 Subject: [PATCH 06/10] Odd rebase artifact. --- candle-examples/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/candle-examples/Cargo.toml b/candle-examples/Cargo.toml index 8779b9a5ca..c4e3465626 100644 --- a/candle-examples/Cargo.toml +++ b/candle-examples/Cargo.toml @@ -36,7 +36,6 @@ tracing-subscriber = { workspace = true } wav = { workspace = true } # Necessary to disambiguate with tokio in wasm examples which are 1.28.1 tokio = "1.29.1" -memmap2.workspace = true [build-dependencies] anyhow = { workspace = true } From ae68635af9dfcae359f621dd3e1df3b3c3d97042 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 2 Aug 2023 18:16:50 +0200 Subject: [PATCH 07/10] Add small error management. --- candle-book/src/error_manage.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/candle-book/src/error_manage.md b/candle-book/src/error_manage.md index af7593d68d..c1a16bd9da 100644 --- a/candle-book/src/error_manage.md +++ b/candle-book/src/error_manage.md @@ -36,4 +36,16 @@ Another thing to note, is that since Rust is compiled it is not necessarily as e especially in release builds. We're using [`anyhow`](https://docs.rs/anyhow/latest/anyhow/) for that. The library is still young, please [report](https://github.com/LaurentMazare/candle/issues) any issues detecting where an error is coming from. +## Cuda error management + +When running a model on Cuda, you might get a stacktrace not really representing the error. 
+The reason is that CUDA is async by nature, and therefore the error might only be caught while you are already launching totally different kernels.
+
+One way to avoid this is to set the `CUDA_LAUNCH_BLOCKING=1` environment variable. This will force every kernel to be launched sequentially.
+You might still, however, see the error reported for other kernels, as the faulty kernel might exit without an error while corrupting some pointer, in which case the error will only show up when the `CudaSlice` is dropped.
+
+
+If this occurs, you can use [`compute-sanitizer`](https://docs.nvidia.com/compute-sanitizer/ComputeSanitizer/index.html).
+This tool is like `valgrind` but for CUDA. It will help locate the errors in the kernels.
+
+
From 166f4d1101437eb36c938781ed0b9270d9a1c282 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 2 Aug 2023 18:35:31 +0200
Subject: [PATCH 08/10] `s/candle/candle_core/g`

---
 candle-book/src/inference/hub.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md
index 01492df198..a974a1faa8 100644
--- a/candle-book/src/inference/hub.md
+++ b/candle-book/src/inference/hub.md
@@ -10,17 +10,17 @@ Then let's start by downloading the [model file](https://huggingface.co/bert-bas
 
 
 ```rust
-# extern crate candle;
+# extern crate candle_core;
 # extern crate hf_hub;
 use hf_hub::api::sync::Api;
-use candle::Device;
+use candle_core::Device;
 
 let api = Api::new().unwrap();
 let repo = api.model("bert-base-uncased".to_string());
 
 let weights = repo.get("model.safetensors").unwrap();
 
-let weights = candle::safetensors::load(weights, &Device::Cpu);
+let weights = candle_core::safetensors::load(weights, &Device::Cpu);
 ```
 
 We now have access to all the [tensors](https://huggingface.co/bert-base-uncased?show_tensors=true) within the file.
@@ -48,7 +48,7 @@ cargo add hf-hub --features tokio
 Now that we have our weights, we can use them in our bert architecture:
 
 ```rust
-# extern crate candle;
+# extern crate candle_core;
 # extern crate candle_nn;
 # extern crate hf_hub;
 # use hf_hub::api::sync::Api;
@@ -57,10 +57,10 @@ Now that we have our weights, we can use them in our bert architecture:
 # let repo = api.model("bert-base-uncased".to_string());
 #
 # let weights = repo.get("model.safetensors").unwrap();
-use candle::{Device, Tensor, DType};
+use candle_core::{Device, Tensor, DType};
 use candle_nn::Linear;
 
-let weights = candle::safetensors::load(weights, &Device::Cpu).unwrap();
+let weights = candle_core::safetensors::load(weights, &Device::Cpu).unwrap();
 
 let weight = weights.get("bert.encoder.layer.0.attention.self.query.weight").unwrap();
 let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap();
From 1b2b32e58d13ac96cee42562b845fcecfd3a08de Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 2 Aug 2023 18:59:36 +0200
Subject: [PATCH 09/10] Remove dead page.t

---
 candle-book/src/inference/serialization.md | 3 ---
 1 file changed, 3 deletions(-)
 delete mode 100644 candle-book/src/inference/serialization.md

diff --git a/candle-book/src/inference/serialization.md b/candle-book/src/inference/serialization.md
deleted file mode 100644
index 133ff02513..0000000000
--- a/candle-book/src/inference/serialization.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Serialization
-
-Once you have a r
From dba31473d40c88fed22574ba96021dc59f25f3f7 Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 2 Aug 2023 19:18:43 +0200
Subject: [PATCH 10/10] Typos and format and CD only when PR lands.
--- .github/workflows/book-cd.yml | 2 -- candle-book/src/inference/hub.md | 4 ++-- candle-examples/src/lib.rs | 8 ++++---- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/book-cd.yml b/.github/workflows/book-cd.yml index fc693a789b..e8149e3832 100644 --- a/.github/workflows/book-cd.yml +++ b/.github/workflows/book-cd.yml @@ -1,7 +1,5 @@ name: Deploy Rust book on: - # TODO put this back only when merging after this PR lands. - pull_request: push: branches: - main diff --git a/candle-book/src/inference/hub.md b/candle-book/src/inference/hub.md index a974a1faa8..b924b76dd8 100644 --- a/candle-book/src/inference/hub.md +++ b/candle-book/src/inference/hub.md @@ -67,8 +67,8 @@ let bias = weights.get("bert.encoder.layer.0.attention.self.query.bias").unwrap( let linear = Linear::new(weight.clone(), Some(bias.clone())); -let input_ids = Tensor::zeros((3, 7680), DType::F32, &Device::Cpu).unwrap(); -let output = linear.forward(&input_ids); +let input_ids = Tensor::zeros((3, 768), DType::F32, &Device::Cpu).unwrap(); +let output = linear.forward(&input_ids).unwrap(); ``` For a full reference, you can check out the full [bert](https://github.com/LaurentMazare/candle/tree/main/candle-examples/examples/bert) example. diff --git a/candle-examples/src/lib.rs b/candle-examples/src/lib.rs index 3410026ee8..2b6009b4e0 100644 --- a/candle-examples/src/lib.rs +++ b/candle-examples/src/lib.rs @@ -73,8 +73,8 @@ let mmap = unsafe { Mmap::map(&file).unwrap() }; // Use safetensors directly let tensors = SafeTensors::deserialize(&mmap[..]).unwrap(); let view = tensors -.tensor("bert.encoder.layer.0.attention.self.query.weight") -.unwrap(); + .tensor("bert.encoder.layer.0.attention.self.query.weight") + .unwrap(); // We're going to load shard with rank 1, within a world_size of 4 // We're going to split along dimension 0 doing VIEW[start..stop, :] @@ -86,7 +86,7 @@ let mut tp_shape = view.shape().to_vec(); let size = tp_shape[0]; if size % world_size != 0 { -panic!("The dimension is not divisble by `world_size`"); + panic!("The dimension is not divisble by `world_size`"); } let block_size = size / world_size; let start = rank * block_size; @@ -102,7 +102,7 @@ tp_shape[dim] = block_size; // Convert safetensors Dtype to candle DType let dtype: DType = dtype.try_into().unwrap(); -// TODO: Implement from_buffer_iterator to we can skip the extra CPU alloc. +// TODO: Implement from_buffer_iterator so we can skip the extra CPU alloc. let raw: Vec = iterator.into_iter().flatten().cloned().collect(); let tp_tensor = Tensor::from_raw_buffer(&raw, dtype, &tp_shape, &Device::Cpu).unwrap(); // ANCHOR_END: book_hub_3