Tutorials and examples #76

Merged
merged 2 commits on Oct 25, 2024

30 changes: 30 additions & 0 deletions examples/intake/intake-test.py
@@ -0,0 +1,30 @@
import warnings

warnings.filterwarnings("ignore")

import intake
import numpy as np
import pandas as pd
import xarray as xr
#import hvplot.pandas, hvplot.xarray
#import holoviews as hv
from distributed import LocalCluster, Client
from ncar_jobqueue import NCARCluster
#hv.extension('bokeh')


if __name__ == '__main__':

# If not using NCAR HPC, use the LocalCluster
#cluster = LocalCluster()
cluster = NCARCluster()
cluster.scale(10)

client = Client(cluster)

catalog = intake.open_esm_datastore(
'file://examples/intake/resources/pelican-test-intake.json'
)

catalog_subset = catalog.search(variable='FLNS', frequency='monthly')
dsets = catalog_subset.to_dataset_dict()
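    # to_dataset_dict() returns a dictionary of xarray Datasets keyed by the
    # catalog's grouping attributes; listing the keys shows what was loaded.
    print(list(dsets.keys()))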
292 changes: 292 additions & 0 deletions examples/pytorch/BasePelicanPytorch.ipynb
@@ -0,0 +1,292 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Training and Evaluating a Model with PyTorch, FSSpec, and Remote CSV Data\n",
"\n",
"This notebook demonstrates how to train a simple neural network using PyTorch with data read from remote CSV files over HTTPS using `fsspec`. The example includes data pipelines for both training and test datasets and evaluates the model's accuracy on the test set.\n",
"\n",
"## Install Dependencies\n",
"\n",
"```python\n",
"!pip install torch fsspec pandas torchdata"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torch.nn as nn\n",
"import torch.optim as optim\n",
"import pandas as pd\n",
"import fsspec\n",
"from torch.utils.data import Dataset, DataLoader\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Nueral Network"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Define a simple feedforward nueral network for the example"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"class SimpleNN(nn.Module):\n",
" def __init__(self):\n",
" super(SimpleNN, self).__init__()\n",
" self.fc1 = nn.Linear(784, 50)\n",
" self.fc2 = nn.Linear(50, 600)\n",
"\n",
" def forward(self, x):\n",
" x = torch.relu(self.fc1(x))\n",
" x = self.fc2(x)\n",
" return x\n"
]
},
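{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional aside (not part of the original tutorial), a dummy batch confirms the layer shapes:\n",
"\n",
"```python\n",
"# Sanity check: a random batch of 4 flattened 28x28 images should map to 10 logits each.\n",
"print(SimpleNN()(torch.randn(4, 784)).shape)  # torch.Size([4, 10])\n",
"```"
]
},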
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define the Custom Dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a custom dataset for PyTorch"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"class CSVDataset(Dataset):\n",
" def __init__(self, data):\n",
" self.data = data\n",
" \n",
" def __len__(self):\n",
" return len(self.data)\n",
" \n",
" def __getitem__(self, index):\n",
" return self.data[index]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define Functions to Read and Process Remote CSV Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This will use the fsspec to read and process data.\n",
"\n",
"Note that this notebook isn't using the fsspec handling functions built into torchdata.datapipes because that package is being deprecated"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def read_csv_from_url(file_url):\n",
" # Create a filesystem object for HTTPS\n",
" fs = fsspec.filesystem('osdf')\n",
" # Open the remote file\n",
" with fs.open(file_url, 'r') as f:\n",
" # Read the file into a pandas DataFrame\n",
" df = pd.read_csv(f, index_col=False)\n",
" return df\n",
"\n",
"def dataframe_to_dataset(df):\n",
" features = df.iloc[:, :-1].values.astype(np.float32) # Assuming last column is target\n",
" targets = df.iloc[:, -1].values.astype(np.int64)\n",
" dataset = [(torch.tensor(feature), torch.tensor(target)) for feature, target in zip(features, targets)]\n",
" return dataset\n",
"\n"
]
},
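{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (an illustrative aside with made-up values, not part of the remote pipeline), `dataframe_to_dataset` can be exercised on a tiny synthetic frame:\n",
"\n",
"```python\n",
"# Two 2-pixel rows with integer labels in the last column.\n",
"toy = pd.DataFrame({'p1': [0.0, 1.0], 'p2': [0.5, 0.5], 'label': [0, 1]})\n",
"print(dataframe_to_dataset(toy)[0])  # (tensor([0.0000, 0.5000]), tensor(0))\n",
"```"
]
},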
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare the Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Get the data remotely from Pelican using fsspec with the 'osdf' protocol. (Note that the OSDF protocol is a specific version of PelicanFS with the discoverURL alreayd set)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Define remote file URLs\n",
"train_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_train.csv'\n",
"test_csv_url = '/chtc/PUBLIC/hzhao292/fashion-mnist_test.csv'\n",
"\n",
"# Read and convert data\n",
"train_df = read_csv_from_url(train_csv_url)\n",
"test_df = read_csv_from_url(test_csv_url)\n",
"train_data = dataframe_to_dataset(train_df)\n",
"test_data = dataframe_to_dataset(test_df)\n",
"\n",
"# Create DataLoaders\n",
"train_dataset = CSVDataset(train_data)\n",
"test_dataset = CSVDataset(test_data)\n",
"train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)\n",
"test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)\n"
]
},
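{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally (an illustrative aside, assuming the CSVs hold 784 pixel columns plus a trailing label column, as the model above expects), verify that the loaders yield batches of the expected shape before training:\n",
"\n",
"```python\n",
"batch_X, batch_y = next(iter(train_loader))\n",
"print(batch_X.shape, batch_y.shape)  # expect torch.Size([16, 784]) and torch.Size([16])\n",
"```"
]
},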
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train the model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Train our example model using the data from Pelican."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"ename": "IndexError",
"evalue": "Target 8 is out of bounds.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[18], line 14\u001b[0m\n\u001b[1;32m 12\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mzero_grad()\n\u001b[1;32m 13\u001b[0m outputs \u001b[38;5;241m=\u001b[39m model(batch_X)\n\u001b[0;32m---> 14\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[43mcriterion\u001b[49m\u001b[43m(\u001b[49m\u001b[43moutputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbatch_y\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 15\u001b[0m loss\u001b[38;5;241m.\u001b[39mbackward()\n\u001b[1;32m 16\u001b[0m optimizer\u001b[38;5;241m.\u001b[39mstep()\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/modules/loss.py:1188\u001b[0m, in \u001b[0;36mCrossEntropyLoss.forward\u001b[0;34m(self, input, target)\u001b[0m\n\u001b[1;32m 1187\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor, target: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m-> 1188\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1189\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreduction\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mreduction\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1190\u001b[0m \u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m)\u001b[49m\n",
"File \u001b[0;32m~/pelican/PelicanPytorchTutorial/.venv/lib/python3.9/site-packages/torch/nn/functional.py:3104\u001b[0m, in \u001b[0;36mcross_entropy\u001b[0;34m(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)\u001b[0m\n\u001b[1;32m 3102\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m size_average \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m reduce \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 3103\u001b[0m reduction \u001b[38;5;241m=\u001b[39m _Reduction\u001b[38;5;241m.\u001b[39mlegacy_get_string(size_average, reduce)\n\u001b[0;32m-> 3104\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_C\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_nn\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcross_entropy_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtarget\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m_Reduction\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_enum\u001b[49m\u001b[43m(\u001b[49m\u001b[43mreduction\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_index\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlabel_smoothing\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[0;31mIndexError\u001b[0m: Target 8 is out of bounds."
]
}
],
"source": [
"# Instantiate model, loss function, and optimizer\n",
"model = SimpleNN()\n",
"criterion = nn.CrossEntropyLoss()\n",
"optimizer = optim.SGD(model.parameters(), lr=0.01)\n",
"\n",
"# Training loop\n",
"epochs = 5\n",
"for epoch in range(epochs):\n",
" model.train()\n",
" running_loss = 0.0\n",
" for batch_X, batch_y in train_loader:\n",
" optimizer.zero_grad()\n",
" outputs = model(batch_X)\n",
" loss = criterion(outputs, batch_y)\n",
" loss.backward()\n",
" optimizer.step()\n",
" running_loss += loss.item() * batch_X.size(0)\n",
" \n",
" epoch_loss = running_loss / len(train_loader.dataset)\n",
" print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluate the Model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Evaluate the accuracy of the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.eval()\n",
"correct = 0\n",
"total = 0\n",
"with torch.no_grad():\n",
" for batch_X, batch_y in test_loader:\n",
" outputs = model(batch_X)\n",
" _, predicted = torch.max(outputs, 1)\n",
" total += batch_y.size(0)\n",
" correct += (predicted == batch_y).sum().item()\n",
"\n",
"accuracy = correct / total\n",
"print(f'Accuracy on test data: {accuracy:.4f}')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
12 changes: 12 additions & 0 deletions examples/pytorch/pytorch_with_pelicanfs.py
@@ -0,0 +1,12 @@
import torch
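# Workaround: newer torch releases removed the DILL_AVAILABLE attribute that
# torchdata still expects, so restore it before importing torchdata.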
torch.utils.data.datapipes.utils.common.DILL_AVAILABLE = torch.utils._import_utils.dill_available()
from torchdata.datapipes.iter import IterableWrapper


if __name__ == '__main__':
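    # List every file under the OSDF namespace; pelicanfs registers the
    # 'osdf' protocol with fsspec, which this datapipe uses for listing.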
dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).list_files_by_fsspec()
print(list(dp))

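    # Stream each file's contents via fsspec; each item is a (path, file-like) pair.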
dp = IterableWrapper(["osdf:///chtc/PUBLIC/eturetsky/data/faces/"]).open_files_by_fsspec()
for path, filestream in dp:
print(path, filestream)