From cf889752a34bb059b60d96088045fcc2a4f31ddb Mon Sep 17 00:00:00 2001 From: Swordfish189 Date: Mon, 6 Jan 2025 14:04:50 +0100 Subject: [PATCH 01/13] [#5867:ConceptEntry]Pytorch:HandlingBatches(docs) --- .../handling-batches/handling-batches.md | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 content/pytorch/concepts/handling-batches/handling-batches.md diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md new file mode 100644 index 00000000000..f34616149d4 --- /dev/null +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -0,0 +1,112 @@ +--- +Title: 'Handling Batches' +Description: 'Learn how to efficiently process data in PyTorch using batch handling techniques.' +Subjects: + - 'Machine Learning' + - 'AI' +Tags: + - 'PyTorch' + - 'Batch Processing' + - 'DataLoader' + - 'Neural Networks' +CatalogContent: + - 'intro-to-py-torch-and-neural-networks' + - 'paths/build-a-machine-learning-model' +--- + +**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. + +## Syntax + +PyTorch simplifies batch handling through the `DataLoader` class. Below is the general syntax for setting up a `DataLoader`: + +```py +from torch.utils.data import DataLoader, Dataset + +# Define a custom dataset +class CustomDataset(Dataset): + def __init__(self, data, labels): + self.data = data + self.labels = labels + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + +# Instantiate the dataset +dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) + +# Create a DataLoader for batch processing +dataloader = DataLoader(dataset, batch_size=2, shuffle=True) +``` + +### Key Parameters +- **`batch_size`**: The number of samples in each batch. +- **`shuffle`**: Randomizes the order of data each epoch, ensuring better model generalization. +- **`num_workers`** (optional): Specifies the number of subprocesses for data loading to speed up training. + +## Example + +Here is an example that demonstrates how to iterate through batches using `DataLoader`: + +```py +for batch_idx, (inputs, labels) in enumerate(dataloader): + print(f"Batch {batch_idx + 1}") + print("Inputs:", inputs) + print("Labels:", labels) +``` + +### Expected Output + +```shell +Batch 1 +Inputs: tensor([[3, 4], + [1, 2]]) +Labels: tensor([1, 0]) + +Batch 2 +Inputs: tensor([[5, 6]]) +Labels: tensor([0]) +``` + +> **Note:** The output order may vary due to `shuffle=True`. + +### Explanation + +1. **Dataset Definition**: A custom dataset is defined to hold the data and labels. +2. **DataLoader Initialization**: The dataset is passed to the `DataLoader`, along with parameters like `batch_size` and `shuffle`. +3. **Batch Iteration**: A `for` loop retrieves each batch, containing input data and corresponding labels. 
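+
+The `num_workers` option from the parameter list above is easiest to see alongside `drop_last`, a related `DataLoader` flag that discards the final incomplete batch. Below is a minimal, hypothetical sketch of both; the built-in `TensorDataset` replaces the custom class purely for brevity, and `drop_last` is an extra illustration not covered by the parameter list:
+
+```py
+import torch
+from torch.utils.data import DataLoader, TensorDataset
+
+# Hypothetical tensors standing in for a real dataset
+features = torch.arange(20, dtype=torch.float32).reshape(10, 2)
+labels = torch.randint(0, 2, (10,))
+dataset = TensorDataset(features, labels)
+
+# 10 samples with batch_size=4 leave a remainder of 2;
+# drop_last=True skips that final, smaller batch, and
+# num_workers=2 loads batches in two background subprocesses
+dataloader = DataLoader(dataset, batch_size=4, shuffle=True,
+                        num_workers=2, drop_last=True)
+
+if __name__ == "__main__":  # guard required when num_workers > 0 on some platforms
+    for inputs, targets in dataloader:
+        print(inputs.shape, targets.shape)  # torch.Size([4, 2]) torch.Size([4])
+```
+
+With `drop_last=True`, only the two full batches of four are yielded; without it, a final batch of two samples would also appear.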
+ +## Codebyte Example + +Below is a runnable Codebyte demonstrating the use of `DataLoader` for batch processing in PyTorch: + +```codebyte/python +from torch.utils.data import DataLoader, Dataset + +# Define a custom dataset +class CustomDataset(Dataset): + def __init__(self, data, labels): + self.data = data + self.labels = labels + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + +# Create dataset and DataLoader +dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) +dataloader = DataLoader(dataset, batch_size=2, shuffle=True) + +# Iterate over batches +for batch_idx, (inputs, labels) in enumerate(dataloader): + print(f"Batch {batch_idx + 1}") + print("Inputs:", inputs) + print("Labels:", labels) +``` + +This example effectively showcases how PyTorch organizes data into batches and simplifies processing, making it a foundational tool for machine learning workflows. \ No newline at end of file From 4367cdc9270eb7257654946dd153391cac227f21 Mon Sep 17 00:00:00 2001 From: Swordfish189 Date: Thu, 9 Jan 2025 17:50:58 +0100 Subject: [PATCH 02/13] satisfy requested changes 1 --- .../handling-batches/handling-batches.md | 85 +++++++++---------- 1 file changed, 39 insertions(+), 46 deletions(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index f34616149d4..5a4687e5a6b 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -1,6 +1,6 @@ --- Title: 'Handling Batches' -Description: 'Learn how to efficiently process data in PyTorch using batch handling techniques.' +Description: 'Batch handling is the process of organizing data into fixed-size groups for efficient computation and processing in PyTorch.' Subjects: - 'Machine Learning' - 'AI' @@ -14,7 +14,7 @@ CatalogContent: - 'paths/build-a-machine-learning-model' --- -**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. +**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the **DataLoader** class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. ## Syntax @@ -42,71 +42,64 @@ dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) dataloader = DataLoader(dataset, batch_size=2, shuffle=True) ``` -### Key Parameters - **`batch_size`**: The number of samples in each batch. - **`shuffle`**: Randomizes the order of data each epoch, ensuring better model generalization. - **`num_workers`** (optional): Specifies the number of subprocesses for data loading to speed up training. ## Example -Here is an example that demonstrates how to iterate through batches using `DataLoader`: +Here is an example that demonstrates how to iterate through batches using DataLoader. 
This code defines a dataset, initializes the DataLoader, and processes the data in batches: ```py -for batch_idx, (inputs, labels) in enumerate(dataloader): +from torch.utils.data import Dataset, DataLoader + +# Define a custom dataset with temperature readings and weather conditions +class WeatherDataset(Dataset): + def __init__(self): + # Simple weather data: (temperature, is_sunny) + self.data = [ + (20, 1), (25, 1), (18, 0), (22, 1), + (17, 0), (23, 1), (19, 0), (21, 1) + ] + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx] + +# Create dataset and DataLoader +dataset = WeatherDataset() +dataloader = DataLoader(dataset, batch_size=3, shuffle=True) + +# Process batches +for batch_idx, batch in enumerate(dataloader): print(f"Batch {batch_idx + 1}") - print("Inputs:", inputs) - print("Labels:", labels) + print("Temperatures:", [temp.item() for temp in batch[0]]) + print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]]) + print() ``` -### Expected Output +The output of the above code will be: ```shell Batch 1 -Inputs: tensor([[3, 4], - [1, 2]]) -Labels: tensor([1, 0]) +Temperatures: [17, 23, 21] +Weather (1=sunny, 0=cloudy): [0, 1, 1] Batch 2 -Inputs: tensor([[5, 6]]) -Labels: tensor([0]) -``` +Temperatures: [20, 25, 19] +Weather (1=sunny, 0=cloudy): [1, 1, 0] -> **Note:** The output order may vary due to `shuffle=True`. +Batch 3 +Temperatures: [18, 22] +Weather (1=sunny, 0=cloudy): [0, 1] +``` -### Explanation +> **Note:** The output order may vary, due to `shuffle=True` randomizing the order of the data. 1. **Dataset Definition**: A custom dataset is defined to hold the data and labels. 2. **DataLoader Initialization**: The dataset is passed to the `DataLoader`, along with parameters like `batch_size` and `shuffle`. 3. **Batch Iteration**: A `for` loop retrieves each batch, containing input data and corresponding labels. -## Codebyte Example - -Below is a runnable Codebyte demonstrating the use of `DataLoader` for batch processing in PyTorch: - -```codebyte/python -from torch.utils.data import DataLoader, Dataset - -# Define a custom dataset -class CustomDataset(Dataset): - def __init__(self, data, labels): - self.data = data - self.labels = labels - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - -# Create dataset and DataLoader -dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) -dataloader = DataLoader(dataset, batch_size=2, shuffle=True) - -# Iterate over batches -for batch_idx, (inputs, labels) in enumerate(dataloader): - print(f"Batch {batch_idx + 1}") - print("Inputs:", inputs) - print("Labels:", labels) -``` - This example effectively showcases how PyTorch organizes data into batches and simplifies processing, making it a foundational tool for machine learning workflows. 
\ No newline at end of file From 817143172238018a610ab26eb35b77e0fafe4748 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:21:56 +0530 Subject: [PATCH 03/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 5a4687e5a6b..90c1f26de12 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -14,7 +14,8 @@ CatalogContent: - 'paths/build-a-machine-learning-model' --- -**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the **DataLoader** class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. +**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the `DataLoader` class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. + ## Syntax From c017637c1b787eef2e879e0e3b26252fa487b72e Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:22:09 +0530 Subject: [PATCH 04/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 90c1f26de12..c88e9a7906f 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -19,7 +19,8 @@ CatalogContent: ## Syntax -PyTorch simplifies batch handling through the `DataLoader` class. 
Below is the general syntax for setting up a `DataLoader`: +Below is the general syntax for setting up a `DataLoader`: + ```py from torch.utils.data import DataLoader, Dataset From 1e17a53020b7780b30ab4c43f8792651f824fd69 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:22:48 +0530 Subject: [PATCH 05/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- .../concepts/handling-batches/handling-batches.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index c88e9a7906f..8534e911e04 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -27,21 +27,22 @@ from torch.utils.data import DataLoader, Dataset # Define a custom dataset class CustomDataset(Dataset): - def __init__(self, data, labels): - self.data = data - self.labels = labels + def __init__(self, data, labels): + self.data = data + self.labels = labels - def __len__(self): - return len(self.data) + def __len__(self): + return len(self.data) - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] # Instantiate the dataset dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) # Create a DataLoader for batch processing dataloader = DataLoader(dataset, batch_size=2, shuffle=True) + ``` - **`batch_size`**: The number of samples in each batch. From f7bd6fe74a137749db89a2eb786a8d788cd0cc14 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:25:01 +0530 Subject: [PATCH 06/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 8534e911e04..247eaa1fdfc 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -51,7 +51,8 @@ dataloader = DataLoader(dataset, batch_size=2, shuffle=True) ## Example -Here is an example that demonstrates how to iterate through batches using DataLoader. This code defines a dataset, initializes the DataLoader, and processes the data in batches: +Here is an example that demonstrates how to iterate through batches using `DataLoader`. 
This code defines a dataset, initializes the DataLoader, and processes the data in batches: + ```py from torch.utils.data import Dataset, DataLoader From 78b3e491b7f56d750e4fcc0de3ad5ff7f79360de Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:25:53 +0530 Subject: [PATCH 07/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- .../handling-batches/handling-batches.md | 29 ++++++++++--------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 247eaa1fdfc..d746ef21b6e 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -59,18 +59,18 @@ from torch.utils.data import Dataset, DataLoader # Define a custom dataset with temperature readings and weather conditions class WeatherDataset(Dataset): - def __init__(self): - # Simple weather data: (temperature, is_sunny) - self.data = [ - (20, 1), (25, 1), (18, 0), (22, 1), - (17, 0), (23, 1), (19, 0), (21, 1) - ] + def __init__(self): + # Simple weather data: (temperature, is_sunny) + self.data = [ + (20, 1), (25, 1), (18, 0), (22, 1), + (17, 0), (23, 1), (19, 0), (21, 1) + ] - def __len__(self): - return len(self.data) + def __len__(self): + return len(self.data) - def __getitem__(self, idx): - return self.data[idx] + def __getitem__(self, idx): + return self.data[idx] # Create dataset and DataLoader dataset = WeatherDataset() @@ -78,10 +78,11 @@ dataloader = DataLoader(dataset, batch_size=3, shuffle=True) # Process batches for batch_idx, batch in enumerate(dataloader): - print(f"Batch {batch_idx + 1}") - print("Temperatures:", [temp.item() for temp in batch[0]]) - print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]]) - print() + print(f"Batch {batch_idx + 1}") + print("Temperatures:", [temp.item() for temp in batch[0]]) + print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]]) + print() + ``` The output of the above code will be: From 03a48afa7f3b7d9959894b21d1de296455ac1708 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:26:07 +0530 Subject: [PATCH 08/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index d746ef21b6e..9f12cb7b05f 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -82,7 +82,6 @@ for batch_idx, batch in enumerate(dataloader): print("Temperatures:", [temp.item() for temp in batch[0]]) print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]]) print() - ``` The output of the above code will be: From 8d84d5cec484f58b9b98b8920abad87d58848af1 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:26:24 +0530 Subject: [PATCH 09/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md 
b/content/pytorch/concepts/handling-batches/handling-batches.md index 9f12cb7b05f..dcf37ac7230 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -42,7 +42,6 @@ dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) # Create a DataLoader for batch processing dataloader = DataLoader(dataset, batch_size=2, shuffle=True) - ``` - **`batch_size`**: The number of samples in each batch. From 823dcc2c1ee6a2ee790f8dd0f442897cebbef00a Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:26:36 +0530 Subject: [PATCH 10/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index dcf37ac7230..cb8592ecb7c 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -21,7 +21,6 @@ CatalogContent: Below is the general syntax for setting up a `DataLoader`: - ```py from torch.utils.data import DataLoader, Dataset From b3842ac0ffa9416fb7e5fcc7a8163f95a574de8f Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:26:48 +0530 Subject: [PATCH 11/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index cb8592ecb7c..402b01c05a6 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -16,7 +16,6 @@ CatalogContent: **Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the `DataLoader` class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. - ## Syntax Below is the general syntax for setting up a `DataLoader`: From 0e44fdc84b310d2bc33f31280220e7ba28e09581 Mon Sep 17 00:00:00 2001 From: Avdhoot <50920321+avdhoottt@users.noreply.github.com> Date: Mon, 13 Jan 2025 12:27:00 +0530 Subject: [PATCH 12/13] Update content/pytorch/concepts/handling-batches/handling-batches.md --- content/pytorch/concepts/handling-batches/handling-batches.md | 1 - 1 file changed, 1 deletion(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 402b01c05a6..1b68b46ee04 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -50,7 +50,6 @@ dataloader = DataLoader(dataset, batch_size=2, shuffle=True) Here is an example that demonstrates how to iterate through batches using `DataLoader`. 
This code defines a dataset, initializes the DataLoader, and processes the data in batches: - ```py from torch.utils.data import Dataset, DataLoader From ca5bf8d34fe74730fde6000f60c5b288a087cd61 Mon Sep 17 00:00:00 2001 From: Sriparno Roy Date: Sat, 18 Jan 2025 12:19:12 +0530 Subject: [PATCH 13/13] Minor changes --- .../handling-batches/handling-batches.md | 212 +++++++++--------- 1 file changed, 107 insertions(+), 105 deletions(-) diff --git a/content/pytorch/concepts/handling-batches/handling-batches.md b/content/pytorch/concepts/handling-batches/handling-batches.md index 1b68b46ee04..34550628238 100644 --- a/content/pytorch/concepts/handling-batches/handling-batches.md +++ b/content/pytorch/concepts/handling-batches/handling-batches.md @@ -1,105 +1,107 @@ ---- -Title: 'Handling Batches' -Description: 'Batch handling is the process of organizing data into fixed-size groups for efficient computation and processing in PyTorch.' -Subjects: - - 'Machine Learning' - - 'AI' -Tags: - - 'PyTorch' - - 'Batch Processing' - - 'DataLoader' - - 'Neural Networks' -CatalogContent: - - 'intro-to-py-torch-and-neural-networks' - - 'paths/build-a-machine-learning-model' ---- - -**Handling batches** is an essential practice in **PyTorch** for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the `DataLoader` class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. - -## Syntax - -Below is the general syntax for setting up a `DataLoader`: - -```py -from torch.utils.data import DataLoader, Dataset - -# Define a custom dataset -class CustomDataset(Dataset): - def __init__(self, data, labels): - self.data = data - self.labels = labels - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - return self.data[idx], self.labels[idx] - -# Instantiate the dataset -dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) - -# Create a DataLoader for batch processing -dataloader = DataLoader(dataset, batch_size=2, shuffle=True) -``` - -- **`batch_size`**: The number of samples in each batch. -- **`shuffle`**: Randomizes the order of data each epoch, ensuring better model generalization. -- **`num_workers`** (optional): Specifies the number of subprocesses for data loading to speed up training. - -## Example - -Here is an example that demonstrates how to iterate through batches using `DataLoader`. 
This code defines a dataset, initializes the DataLoader, and processes the data in batches: - -```py -from torch.utils.data import Dataset, DataLoader - -# Define a custom dataset with temperature readings and weather conditions -class WeatherDataset(Dataset): - def __init__(self): - # Simple weather data: (temperature, is_sunny) - self.data = [ - (20, 1), (25, 1), (18, 0), (22, 1), - (17, 0), (23, 1), (19, 0), (21, 1) - ] - - def __len__(self): - return len(self.data) - - def __getitem__(self, idx): - return self.data[idx] - -# Create dataset and DataLoader -dataset = WeatherDataset() -dataloader = DataLoader(dataset, batch_size=3, shuffle=True) - -# Process batches -for batch_idx, batch in enumerate(dataloader): - print(f"Batch {batch_idx + 1}") - print("Temperatures:", [temp.item() for temp in batch[0]]) - print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]]) - print() -``` - -The output of the above code will be: - -```shell -Batch 1 -Temperatures: [17, 23, 21] -Weather (1=sunny, 0=cloudy): [0, 1, 1] - -Batch 2 -Temperatures: [20, 25, 19] -Weather (1=sunny, 0=cloudy): [1, 1, 0] - -Batch 3 -Temperatures: [18, 22] -Weather (1=sunny, 0=cloudy): [0, 1] -``` - -> **Note:** The output order may vary, due to `shuffle=True` randomizing the order of the data. - -1. **Dataset Definition**: A custom dataset is defined to hold the data and labels. -2. **DataLoader Initialization**: The dataset is passed to the `DataLoader`, along with parameters like `batch_size` and `shuffle`. -3. **Batch Iteration**: A `for` loop retrieves each batch, containing input data and corresponding labels. - -This example effectively showcases how PyTorch organizes data into batches and simplifies processing, making it a foundational tool for machine learning workflows. \ No newline at end of file +--- +Title: 'Handling Batches' +Description: 'Batch handling is the process of organizing data into fixed-size groups for efficient computation and processing in PyTorch.' +Subjects: + - 'Machine Learning' + - 'AI' +Tags: + - 'PyTorch' + - 'Batch Processing' + - 'DataLoader' + - 'Neural Networks' +CatalogContent: + - 'intro-to-py-torch-and-neural-networks' + - 'paths/build-a-machine-learning-model' +--- + +**Handling batches** is an essential practice in PyTorch for managing and processing large datasets efficiently. PyTorch simplifies batch handling through the `DataLoader` class. Batch processing groups data samples into fixed-sized subsets, enabling parallel computation, faster training, and better use of GPU resources. This technique is especially critical for deep learning workflows, where training on entire datasets at once is often computationally infeasible. + +## Syntax + +Below is the general syntax for setting up a `DataLoader`: + +```pseudo +from torch.utils.data import DataLoader, Dataset + +# Define a custom dataset +class CustomDataset(Dataset): + def __init__(self, data, labels): + self.data = data + self.labels = labels + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return self.data[idx], self.labels[idx] + +# Instantiate the dataset +dataset = CustomDataset(data=[[1, 2], [3, 4], [5, 6]], labels=[0, 1, 0]) + +# Create a DataLoader for batch processing +dataloader = DataLoader(dataset, batch_size=2, shuffle=True) +``` + +- `batch_size`: The number of samples in each batch. +- `shuffle`: Randomizes the order of data each epoch, ensuring better model generalization. 
+- `num_workers` (Optional): Specifies the number of subprocesses for data loading to speed up training.
+
+## Example
+
+Here is an example that demonstrates how to iterate through batches using `DataLoader`. This code defines a dataset, initializes the `DataLoader`, and processes the data in batches:
+
+```py
+from torch.utils.data import Dataset, DataLoader
+
+# Define a custom dataset with temperature readings and weather conditions
+class WeatherDataset(Dataset):
+    def __init__(self):
+        # Simple weather data: (temperature, is_sunny)
+        self.data = [
+            (20, 1), (25, 1), (18, 0), (22, 1),
+            (17, 0), (23, 1), (19, 0), (21, 1)
+        ]
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        return self.data[idx]
+
+# Create dataset and DataLoader
+dataset = WeatherDataset()
+dataloader = DataLoader(dataset, batch_size=3, shuffle=True)
+
+# Process batches
+for batch_idx, batch in enumerate(dataloader):
+    print(f"Batch {batch_idx + 1}")
+    print("Temperatures:", [temp.item() for temp in batch[0]])
+    print("Weather (1=sunny, 0=cloudy):", [w.item() for w in batch[1]])
+    print()
+```
+
+The output of the above code will be:
+
+```shell
+Batch 1
+Temperatures: [17, 23, 21]
+Weather (1=sunny, 0=cloudy): [0, 1, 1]
+
+Batch 2
+Temperatures: [20, 25, 19]
+Weather (1=sunny, 0=cloudy): [1, 1, 0]
+
+Batch 3
+Temperatures: [18, 22]
+Weather (1=sunny, 0=cloudy): [0, 1]
+```
+
+> **Note:** The output order may vary because `shuffle=True` randomizes the data each epoch. The final batch also holds only two samples, since the dataset's 8 samples do not divide evenly into batches of 3.
+
+The above example involves three steps:
+
+1. **Dataset Definition**: A custom dataset is defined to hold the data and labels.
+2. **DataLoader Initialization**: The dataset is passed to the `DataLoader`, along with parameters like `batch_size` and `shuffle`.
+3. **Batch Iteration**: A `for` loop retrieves each batch of input data and its corresponding labels.
+
+This example shows how PyTorch organizes data into batches and simplifies their processing, making the `DataLoader` a foundational tool for machine learning workflows.
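+
+To see where batch handling fits in a full workflow, below is a minimal, hypothetical training-loop sketch that consumes `DataLoader` batches. The random data, the `nn.Linear` stand-in model, and the optimizer settings are illustrative assumptions, not part of the entry above:
+
+```py
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader, TensorDataset
+
+# Toy regression data: 100 samples with 2 features each (illustrative only)
+X = torch.randn(100, 2)
+y = torch.randn(100, 1)
+loader = DataLoader(TensorDataset(X, y), batch_size=16, shuffle=True)
+
+model = nn.Linear(2, 1)  # minimal stand-in model
+optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+loss_fn = nn.MSELoss()
+
+for epoch in range(3):
+    for inputs, targets in loader:  # one iteration per batch
+        optimizer.zero_grad()       # clear gradients from the previous batch
+        loss = loss_fn(model(inputs), targets)
+        loss.backward()             # gradients are computed over the whole batch
+        optimizer.step()
+    print(f"Epoch {epoch + 1}, last batch loss: {loss.item():.4f}")
+```
+
+Each `optimizer.step()` updates the model from gradients averaged over a single batch, which is what makes mini-batch training both memory-friendly and fast on GPUs.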