Tutorials and examples #76

Merged
merged 2 commits on Oct 25, 2024

30 changes: 30 additions & 0 deletions examples/intake/intake-test.py
@@ -0,0 +1,30 @@
import warnings

warnings.filterwarnings("ignore")

import intake
import numpy as np
import pandas as pd
import xarray as xr
#import hvplot.pandas, hvplot.xarray
#import holoviews as hv
from distributed import LocalCluster, Client
from ncar_jobqueue import NCARCluster
#hv.extension('bokeh')


if __name__ == '__main__':

# If not using NCAR HPC, use the LocalCluster
#cluster = LocalCluster()
cluster = NCARCluster()
cluster.scale(10)

client = Client(cluster)

catalog = intake.open_esm_datastore(
'file://examples/intake/resources/pelican-test-intake.json'
)

catalog_subset = catalog.search(variable='FLNS', frequency='monthly')
dsets = catalog_subset.to_dataset_dict()
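    # to_dataset_dict() returns a dictionary of xarray Datasets keyed by the
    # catalog's grouping attributes; listing the keys shows what was loaded.
    print(list(dsets.keys()))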
292 changes: 292 additions & 0 deletions examples/pytorch/BasePelicanPytorch.ipynb
@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training and Evaluating a Model with PyTorch, FSSpec, and Remote CSV Data\n",
"\n",
"This notebook demonstrates how to train a simple neural network using PyTorch with data read from remote CSV files over HTTPS using `fsspec`. The example includes data pipelines for both training and test datasets and evaluates the model's accuracy on the test set.\n",
"\n",
"## Install Dependencies\n",
"\n",
"```python\n",
"!pip install torch fsspec pandas torchdata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import pandas as pd\n",
"import fsspec\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Nueral Network"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a simple feedforward nueral network for the example"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class SimpleNN(nn.Module):\n",
" def __init__(self):\n",
" super(SimpleNN, self).__init__()\n",
" self.fc1 = nn.Linear(784, 50)\n",
" self.fc2 = nn.Linear(50, 600)\n",
"\n",
" def forward(self, x):\n",
" x = torch.relu(self.fc1(x))\n",
" x = self.fc2(x)\n",
" return x\n"
]
},
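{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional aside (not part of the original tutorial), a dummy batch confirms the layer shapes:\n",
"\n",
"```python\n",
"# Sanity check: a random batch of 4 flattened 28x28 images should map to 10 logits each.\n",
"print(SimpleNN()(torch.randn(4, 784)).shape)  # torch.Size([4, 10])\n",
"```"
]
},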
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Custom Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a custom dataset for PyTorch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"class CSVDataset(Dataset):\n",
" def __init__(self, data):\n",
" self.data = data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
" \n",
" def __getitem__(self, index):\n",
" return self.data[index]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define Functions to Read and Process Remote CSV Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This will use the fsspec to read and process data.\n",
"\n",
"Note that this notebook isn't using the fsspec handling functions built into torchdata.datapipes because that package is being deprecated"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def read_csv_from_url(file_url):\n",
" # Create a filesystem object for HTTPS\n",
" fs = fsspec.filesystem('osdf')\n",
" # Open the remote file\n",
" with fs.open(file_url, 'r') as f:\n",
" # Read the file into a pandas DataFrame\n",
" df = pd.read_csv(f, index_col=False)\n",
" return df\n",
"\n",
"def dataframe_to_dataset(df):\n",
" features = df.iloc[:, :-1].values.astype(np.float32) # Assuming last column is target\n",
" targets = df.iloc[:, -1].values.astype(np.int64)\n",
" dataset = [(torch.tensor(feature), torch.tensor(target)) for feature, target in zip(features, targets)]\n",
" return dataset\n",
"\n"
]
},
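{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (an illustrative aside with made-up values, not part of the remote pipeline), `dataframe_to_dataset` can be exercised on a tiny synthetic frame:\n",
"\n",
"```python\n",
"# Two 2-pixel rows with integer labels in the last column.\n",
"toy = pd.DataFrame({'p1': [0.0, 1.0], 'p2': [0.5, 0.5], 'label': [0, 1]})\n",
"print(dataframe_to_dataset(toy)[0])  # (tensor([0.0000, 0.5000]), tensor(0))\n",
"```"
]
},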
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare the Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the data remotely from Pelican using fsspec with the 'osdf' protocol. (Note that the OSDF protocol is a specific version of PelicanFS with the discoverURL alreayd set)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Define remote file URLs\n",
"train_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_train.csv'\n",
"test_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_test.csv'\n",
"\n",
"# Read and convert data\n",
"train_df = read_csv_from_url(train_csv_url)\n",
"test_df = read_csv_from_url(test_csv_url)\n",
"train_data = dataframe_to_dataset(train_df)\n",
"test_data = dataframe_to_dataset(test_df)\n",
"\n",
"# Create DataLoaders\n",
"train_dataset = CSVDataset(train_data)\n",
"test_dataset = CSVDataset(test_data)\n",
"train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n"
]
},
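{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally (an illustrative aside, assuming the CSVs hold 784 pixel columns plus a trailing label column, as the model above expects), verify that the loaders yield batches of the expected shape before training:\n",
"\n",
"```python\n",
"batch_X, batch_y = next(iter(train_loader))\n",
"print(batch_X.shape, batch_y.shape)  # expect torch.Size([16, 784]) and torch.Size([16])\n",
"```"
]
},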
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train our example model using the data from Pelican."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "Target 8 is out of bounds.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 14\u001b[0m\n\u001b[1;32m 12\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 13\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(batch_X)\n\u001b[0;32m---> 14\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mcriterion\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_y\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[1;32m 16\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/loss.py:1188\u001b[0m, in \u001b[0;36mCrossEntropyLoss.forward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor, target: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m-> 1188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1189\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1190\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/functional.py:3104\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m 3102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 3103\u001b[0m reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mIndexError\u001b[0m: Target 8 is out of bounds."
]
}
],
"source": [
"# Instantiate model, loss function, and optimizer\n",
"model = SimpleNN()\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.01)\n",
"\n",
"# Training loop\n",
"epochs = 5\n",
"for epoch in range(epochs):\n",
" model.train()\n",
" running_loss = 0.0\n",
" for batch_X, batch_y in train_loader:\n",
" optimizer.zero_grad()\n",
" outputs = model(batch_X)\n",
" loss = criterion(outputs, batch_y)\n",
" loss.backward()\n",
" optimizer.step()\n",
" running_loss += loss.item() * batch_X.size(0)\n",
" \n",
" epoch_loss = running_loss / len(train_loader.dataset)\n",
" print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate the Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate the accuracy of the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for batch_X, batch_y in test_loader:\n",
" outputs = model(batch_X)\n",
" _, predicted = torch.max(outputs, 1)\n",
" total += batch_y.size(0)\n",
" correct += (predicted == batch_y).sum().item()\n",
"\n",
"accuracy = correct / total\n",
"print(f'Accuracy on test data: {accuracy:.4f}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
12 changes: 12 additions & 0 deletions examples/pytorch/pytorch_with_pelicanfs.py
@@ -0,0 +1,12 @@
import torch
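# Workaround: newer torch releases removed the DILL_AVAILABLE attribute that
# torchdata still expects, so restore it before importing torchdata.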
torch.utils.data.datapipes.utils.common.DILL_AVAILABLE = torch.utils._import_utils.dill_available()
from torchdata.datapipes.iter import IterableWrapper


if __name__ == '__main__':
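    # List every file under the OSDF namespace; pelicanfs registers the
    # 'osdf' protocol with fsspec, which this datapipe uses for listing.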
dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).list_files_by_fsspec()
print(list(dp))

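    # Stream each file's contents via fsspec; each item is a (path, file-like) pair.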
dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).open_files_by_fsspec()
for path, filestream in dp:
print(path, filestream)