Commit

Merge pull request #41 from huggingface/xrsrke/setup_cicd
Add CI/CD for unit tests
xrsrke authored Feb 16, 2024
2 parents 31aa4c4 + c705f4d commit fa4685a
Showing 23 changed files with 517 additions and 163 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -0,0 +1,63 @@
name: Run non-FA2-related unit tests

on:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

jobs:
tests:
runs-on: [multi-gpu, nvidia-gpu, 8-t4, ci]
container:
image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v3
- name: Python environment
run: |
which python
python --version
- name: Check Pytorch version
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install nanotron
run: |
python -m pip install --upgrade pip
pip install packaging
pip install wheel
pip install -e .
pip install -e .[dev]
pip install -e .[test]
- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt

- name: Run tests
# NOTE: -m "not fa2" will run all the unit tests that don't have the mark
# "fa2" (these are FA2-related tests, we can't run it on T4)
run: |
pytest \
-m "not fa2" \
--color=yes \
--durations=0 \
--ignore tests/kernels \
--ignore tests/fp8 \
--verbose \
tests/
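
For context on the marker filter above: a minimal sketch of how a test opts into the "fa2" mark so that -m "not fa2" deselects it on the T4 runners (the test name and body here are illustrative, not taken from this commit):

import pytest

@pytest.mark.fa2  # deselected by `pytest -m "not fa2"`, selected by `pytest -m fa2`
def test_flash_attn_is_importable():
    # Skip (rather than fail) when flash-attn 2 isn't installed on the runner.
    flash_attn = pytest.importorskip("flash_attn")
    assert flash_attn is not None
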
26 changes: 26 additions & 0 deletions .github/workflows/code_quality.yaml
@@ -0,0 +1,26 @@
name: Code Quality

on:
workflow_dispatch:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"

jobs:
cloc:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Count Lines of Code (cloc)
uses: djdefi/cloc-action@6
with:
options: --by-file-by-lang --exclude-dir=docs,tests,examples --exclude-lang=YAML,Markdown,TOML --exclude-list-file=sanity_checks.py
58 changes: 58 additions & 0 deletions .github/workflows/fa2_unit_tests.yaml
@@ -0,0 +1,58 @@
name: Run FA2-related unit tests

on:
workflow_dispatch:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

jobs:
tests:
runs-on: [single-gpu, nvidia-gpu, a10, ci]
container:
image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v3

- name: Python environment
run: |
which python
python --version
- name: Check Pytorch version
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install nanotron
run: |
python -m pip install --upgrade pip
pip install packaging
pip install wheel
pip install "flash-attn>=2.5.0" --no-build-isolation
pip install -e .
pip install -e .[dev]
pip install -e .[test]
- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt

- name: Run tests
# NOTE: -m fa2 will only run the unit tests that have the mark
# "fa2" (these are FA2-related tests)
run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --verbose tests/
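
One more note on the "fa2" mark: pytest warns about unregistered marks, so the repository is expected to register it somewhere; a minimal sketch of such a registration in a conftest.py (the actual location and wording used by nanotron are not shown in this diff):

# conftest.py (illustrative)
def pytest_configure(config):
    # Register the custom mark so -m fa2 / -m "not fa2" filtering does not
    # trigger PytestUnknownMarkWarning.
    config.addinivalue_line("markers", "fa2: tests that require flash-attn 2")
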
1 change: 0 additions & 1 deletion .gitignore
@@ -160,6 +160,5 @@ cython_debug/
#.idea/

.vscode
- .github

checkpoints/
14 changes: 13 additions & 1 deletion src/nanotron/distributed.py
@@ -9,6 +9,8 @@
from torch.distributed import * # noqa
from torch.distributed.distributed_c10d import ProcessGroup

from nanotron.utils import find_free_port

torch_version_above_1_13 = version.parse(torch.__version__) >= version.parse("1.13.0")
Work = dist.Work if torch_version_above_1_13 else dist._Work
default_pg_timeout = datetime.timedelta(minutes=10)
@@ -257,5 +259,15 @@ def initialize_torch_distributed():
backend = "gloo"

# Call the init process.
- dist.init_process_group(backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+
+ port = os.getenv("MASTER_PORT")
+ if port is None:
+     port = find_free_port()
+ else:
+     port = int(port)
+
+ init_method = f"env://localhost:{port}"
+ dist.init_process_group(
+     init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+ )
return True
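
The net effect of this hunk is that the rendezvous port comes from MASTER_PORT when it is set and from find_free_port() otherwise, so parallel CI jobs on one machine don't collide on a hard-coded port. A standalone sketch of that fallback (assumes nanotron is installed; this is not nanotron's own test code):

import os

from nanotron.utils import find_free_port

# Prefer an explicitly configured MASTER_PORT; otherwise grab a free one.
port = os.getenv("MASTER_PORT")
port = int(port) if port is not None else find_free_port()
print(f"rendezvous port: {port}")
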
4 changes: 2 additions & 2 deletions src/nanotron/optim/clip_grads.py
@@ -56,7 +56,7 @@ def clip_grad_norm(
torch.stack([torch.linalg.vector_norm(g.detach(), ord=torch.inf, dtype=torch.float) for g in grads])
)
else:
- total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+ total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.MAX)

else:
@@ -68,7 +68,7 @@ def clip_grad_norm(
dtype=torch.float,
).pow(norm_type)
else:
- total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+ total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.SUM)
total_norm.pow_(1.0 / norm_type)

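The torch.zeros(1) -> torch.zeros([]) change swaps a 1-element vector for a 0-dim scalar, matching the shape torch.linalg.vector_norm returns on the ranks that do own gradients, so the all_reduce operates on consistently shaped tensors. A quick illustration (CPU tensors here instead of CUDA):

import torch

print(torch.zeros(1).shape)                           # torch.Size([1]) - 1-element vector
print(torch.zeros([]).shape)                          # torch.Size([])  - 0-dim scalar
print(torch.linalg.vector_norm(torch.ones(3)).shape)  # torch.Size([])  - also a 0-dim scalar
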
9 changes: 8 additions & 1 deletion src/nanotron/parallel/context.py
@@ -35,7 +35,7 @@ def __init__(
)

if not dist.is_available():
- raise ValueError("`torch.distributed is not available as a package, please install it.")
+ raise ValueError("torch.distributed is not available as a package, please install it.")

self.tensor_parallel_size = tensor_parallel_size
self.pipeline_parallel_size = pipeline_parallel_size
@@ -148,3 +148,10 @@ def get_3d_ranks(self, world_rank: int) -> Tuple[int, int, int]:
dp_rank = (world_rank // self.tp_pg.size()) % self.dp_pg.size()
tp_rank = world_rank % self.tp_pg.size()
return (pp_rank, dp_rank, tp_rank)

def destroy(self):
if not dist.is_initialized():
return

dist.barrier()
dist.destroy_process_group()
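
A minimal sketch of how a test might lean on the new destroy() helper for teardown (the wrapper function here is an assumption, not part of this commit):

def run_with_cleanup(parallel_context, test_fn):
    # Tear the process group down even if the test body raises;
    # destroy() is a no-op when torch.distributed was never initialized.
    try:
        test_fn(parallel_context)
    finally:
        parallel_context.destroy()  # dist.barrier() then dist.destroy_process_group()
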
14 changes: 14 additions & 0 deletions src/nanotron/utils.py
@@ -2,6 +2,8 @@
import inspect
import math
import os
import random
import socket
from contextlib import ExitStack, contextmanager
from typing import Callable, ContextManager, List, Optional

@@ -147,3 +149,15 @@ def tensor_from_untyped_storage(untyped_storage: torch.UntypedStorage, dtype: to
tensor = torch.empty([], dtype=dtype, device=device)
tensor.set_(source=untyped_storage)
return tensor


def find_free_port(min_port: int = 2000, max_port: int = 65000) -> int:
while True:
port = random.randint(min_port, max_port)
try:
with socket.socket() as sock:
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("localhost", port))
return port
except OSError:
continue