Make torch examples to use ray (#219)

* Replace all torch examples with self contained code. Co-authored-by: Alex Samylkin <alsamylk@microsoft.com>
microsoft · Sep 11, 2023 · a707bfb · a707bfb
1 parent e40343c
commit a707bfb
Show file tree

Hide file tree

Showing 70 changed files with 2,068 additions and 8,196 deletions.
diff --git a/.github/actions/test/action.yml b/.github/actions/test/action.yml
@@ -18,7 +18,6 @@ runs:
 
     - run: |
         bazel test -c dbg //src/cc/tests:* --test_output=all --test_timeout 30 --config=${{ env.BAZEL_CONFIG }} ${{ env.REMOTE_CACHE }} --verbose_failures
-        bazel clean --expunge
       shell: bash
       name: run cpp tests
     - run: |
@@ -27,12 +26,17 @@ runs:
       shell: bash
       name: run python tests
     - run: |
-        bazel test -c dbg //examples/pytorch/...:* --jobs 1 --test_output=all --test_timeout 6000 --config=${{ env.BAZEL_CONFIG }} ${{ env.REMOTE_CACHE }}  --verbose_failures
+        bazel run -c dbg //examples/pytorch:aml --config=${{ env.BAZEL_CONFIG }}
+        bazel run -c dbg //examples/pytorch:gcn --config=${{ env.BAZEL_CONFIG }}
+        bazel run -c dbg //examples/pytorch:gat --config=${{ env.BAZEL_CONFIG }}
+        bazel run -c dbg //examples/pytorch:tgn --config=${{ env.BAZEL_CONFIG }}
+        bazel run -c dbg //examples/pytorch/hetgnn:main --config=${{ env.BAZEL_CONFIG }}
+        bazel run -c dbg //examples/pytorch:sage --config=${{ env.BAZEL_CONFIG }}
         bazel test -c dbg //examples/tensorflow/...:* --jobs 1 --test_output=all --test_timeout 6000 --config=${{ env.BAZEL_CONFIG }} ${{ env.REMOTE_CACHE }}  --verbose_failures
         bazel test -c dbg //docs:* --test_output=all --jobs 1 --config=${{ env.BAZEL_CONFIG }} ${{ env.REMOTE_CACHE }}  --verbose_failures
         bazel clean --expunge
       shell: bash
-      name: run python tests in examples and docs folders
+      name: run python examples and doctests
       if: runner.os == 'Linux'
     - run: |
         bazel run -c dbg //docs:make_docs --config=linux ${{ env.REMOTE_CACHE }}  --verbose_failures

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -124,50 +124,3 @@ jobs:
         run: cd ./tensorflow/han/ && sudo chmod +x *.sh && bash ./run.sh CPU cleanup
       - name: Run gat examples
         run: cd ./tensorflow/gat/ && sudo chmod +x *.sh && bash ./run.sh CPU cleanup
-  torch_examples:
-    needs: wheel
-    strategy:
-      matrix:
-        python-version: ["3.10"]
-        os: ["ubuntu-22.04"]
-    runs-on: ${{ matrix.os }}
-    env:
-      HOROVOD_WITH_GLOO: 1
-      HOROVOD_WITHOUT_MPI: 1
-      HOROVOD_WITHOUT_TENSORFLOW: 1
-      HOROVOD_WITH_PYTORCH: 1
-      HOROVOD_WITHOUT_MXNET: 1
-    steps:
-      - name: Download wheels
-        uses: actions/download-artifact@v3.0.2
-        with:
-          name: deepgnn
-      - name: Download examples
-        uses: actions/download-artifact@v3.0.2
-        with:
-          name: examples
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v4.7.0
-        with:
-          python-version: ${{ matrix.python-version }}
-      - run: |
-          pip install --upgrade pip
-          pip install torch==1.13.1 packaging==21.3 setuptools==67.8.0 cmake==3.27.1
-          pip install --timeout 300 -r pytorch/requirements.txt
-        name: Install dependencies
-      - name: Uninstall cached deepgnn packages
-        run: pip uninstall deepgnn-ge deepgnn-torch
-      - name: Install wheels
-        run: pip install deepgnn_ge*.whl deepgnn_torch*.whl
-      - name: Run gat examples
-        run: cd ./pytorch/gat/ && sudo chmod +x *.sh && bash ./run.sh CPU
-      - name: Run pytorch geometric examples
-        run: cd ./pytorch/geometric/gat/ && sudo chmod +x *.sh && bash ./run.sh CPU
-      - name: Run graphsage supervised examples
-        run: cd ./pytorch/graphsage/ && sudo chmod +x *.sh && bash ./run.sh supervised no CPU && bash ./run.sh supervised no CPU disk yes
-      - name: Run graphsage unsupervised examples
-        run: cd ./pytorch/graphsage/ && sudo chmod +x *.sh && bash ./run.sh unsupervised no CPU
-      - name: Run hetgnn examples
-        run: cd ./pytorch/hetgnn/ && sudo chmod +x *.sh && bash ./run.sh CPU
-      - name: Run AML example
-        run: cd ./pytorch/ && sudo chmod +x *.py && python aml.py --unit_test
diff --git a/.mypy.ini b/.mypy.ini
@@ -78,3 +78,6 @@ ignore_missing_imports = True
 
 [mypy-tenacity.*]
 ignore_missing_imports = True
+
+[mypy-torch_sparse.tensor.*]
+ignore_missing_imports = True
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,11 +9,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 - Add `return_edge_created_ts` argument to neighbor sampling methods to return timestamps when edges connecting nodes were created.
 - `MOOC` temporal dataset.
+- TGN example.
+- GCN example.
 
 ### Fixed
 - Uniform sampling works in temporal graphs.
 - ADL path parsing to download graph data.
 
+### Changed
+- Changed pytorch examples to be self contained and use [Ray](https://www.ray.io/) for distributed training.
+
+### Removed
+- Link prediction and knowledge graph examples.
+
 ## [0.1.60] - 2022-04-18
 
 ### Added

diff --git a/examples/pytorch/BUILD b/examples/pytorch/BUILD
@@ -1,12 +1,73 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
+load("@rules_python//python:defs.bzl", "py_binary")
+load("@pip_deps//:requirements.bzl", "requirement")
 
-load("@rules_python//python:defs.bzl", "py_library")
+common_deps = [
+    "//src/python/deepgnn:deepgnn_ge_wheel_library",
+    requirement("azure-datalake-store"),
+    requirement("fsspec"),
+    requirement("grpcio"),
+    requirement("msgpack"),
+    requirement("numpy"),
+    requirement("networkx"),
+    requirement("opencensus"),
+    requirement("opencensus-context"),
+    requirement("opencensus-ext-azure"),
+    requirement("packaging"),
+    requirement("pyyaml"),
+    requirement("ray"),
+    requirement("referencing"),
+    requirement("rpds"),
+    requirement("rpds-py"),
+    requirement("scikit-learn"),
+    requirement("torch"),
+    requirement("torch_geometric"),
+    requirement("tenacity"),
+]
 
-py_library(
-    name = "example_torch_tests",
+sparse_deps = common_deps + [
+    requirement("torch-sparse"),
+    requirement("torch-scatter"),
+    requirement("torch-cluster"),
+]
+
+py_binary(
+    name = "gcn",
+    srcs = [
+        "gcn.py",
+    ],
+    deps = sparse_deps,
+)
+
+py_binary(
+    name = "gat",
+    srcs = [
+        "gat.py",
+    ],
+    deps = common_deps,
+)
+
+py_binary(
+    name = "tgn",
+    srcs = [
+        "tgn.py",
+    ],
+    deps = sparse_deps,
+)
+
+py_binary(
+    name = "sage",
+    srcs = [
+        "sage.py",
+    ],
+    deps = sparse_deps,
+)
+
+py_binary(
+    name = "aml",
     srcs = [
-        "conftest.py",
+        "aml.py",
     ],
-    visibility = ["//visibility:public"],
+    deps = common_deps,
 )
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
@@ -0,0 +1,54 @@
+# Introduction
+
+This directory contains example models that can be trained with DeepGNN. To train and evaluate any model use `python {model_name}.py`. For example, `python sage.py` will train a sage model with default arguments.
+
+# GAT
+
+Graph attention networks (GATs) leverage masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods’ features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront.
+
+- Reference : [https://arxiv.org/abs/1710.10903](https://arxiv.org/abs/1710.10903)
+- Author's code: [https://github.com/PetarV-/GAT](https://github.com/PetarV-/GAT)
+- `gat.py` contains GAT model implementation based on pytorch-geometric layers with full subgraph minibatches.
+
+# SAGE
+GraphSAGE is a framework for inductive representation learning on large graphs. GraphSAGE is used to generate low-dimensional vector representations for nodes, and is especially useful for graphs that have rich node attribute information.
+Reference: [Inductive Representation Learning on Large Graphs](https://cs.stanford.edu/people/jure/pubs/graphsage-nips17.pdf)
+
+- `sage.py` contains an unsupervised graphsage model with pytorch-geometric layers, trained on subgraphs with sampled neighbors from the Cora dataset.
+
+# HetGNN
+
+Heterogeneous graphs contain abundant information with structural relations (edges) among multi-typed nodes as well as unstructured content associated with each node.
+
+HetGNN introduces a random walk with restart strategy to sample a fixed size of strongly correlated heterogeneous neighbors for each node and group them based upon node types. Next, it designs a neural network architecture with two modules to aggregate feature information of those sampled neighboring nodes. The first module encodes “deep” feature interactions of heterogeneous contents and generates content embedding for each node. The second module aggregates content (attribute) embeddings of different neighboring groups (types) and further combines them by considering the impacts of different groups to obtain the ultimate node embedding. Finally, it leverages a graph context loss and a mini-batch gradient descent procedure to train the model in an end-to-end manner.
+
+- Reference: [Heterogeneous Graph Neural Network](https://www3.nd.edu/~dial/publications/zhang_2019_heterogeneous.pdf)
+- `hetgnn/main.py` is the script to train a HetGNN model on an academic graph.
+- `hetgnn/graph.py` contains a pure python implementation of DeepGNN graph API needed to train the model above.
+
+
+# TGN
+
+Temporal Graph Networks (TGNs) are a generic framework for deep learning on dynamic graphs represented as sequences of timed events.
+TGNs are made of an encoder-decoder pair that transforms dynamic graphs into node embeddings and makes task-specific predictions.
+The TGN encoder operates on continuous-time dynamic graphs and translates time-stamped events into node embeddings. Its core modules
+include a memory function to retain a node's history, message functions to compute updates to a node's memory during an event, and an
+embedding module to tackle the staleness problem, allowing up-to-date embeddings even when a node has been inactive for a while.
+Aggregation and memory update functions are used to manage messages related to nodes, and multiple formulations for embedding are provided,
+including Temporal Graph Attention and Temporal Graph Sum.
+
+- Reference: [https://arxiv.org/abs/2006.10637](https://arxiv.org/abs/2006.10637)
+- `tgn.py` contains TGN model implementation with pytorch-geometric modules and temporal graph based on MOOC dataset.
+
+# GCN
+
+Graph Convolutional Networks operate directly on graphs via a localized first-order approximation of spectral graph convolutions.
+The model scales linearly in the number of graph edges and learns hidden layer representations that encode both local graph structure and features of nodes.
+
+- Reference: [https://arxiv.org/abs/1609.02907](https://arxiv.org/abs/1609.02907)
+- `gcn.py` contains GCN model implementation with pytorch-geometric modules and distributed training with 2 [Ray](https://www.ray.io/) workers.
+
+# AML
+
+Graph engine servers must start before worker clients connect, and the reverse ordering applies at shutdown: servers have to wait for all clients to disconnect before shutting down.
+The `aml.py` example shows how to do this with Azure ML workflows.