Commit

Merge pull request #41 from huggingface/xrsrke/setup_cicd
Add CI/CD for unit tests
xrsrke authored Feb 16, 2024
2 parents 31aa4c4 + c705f4d commit fa4685a
Showing 23 changed files with 517 additions and 163 deletions.
63 changes: 63 additions & 0 deletions .github/workflows/3d_parallelism_unit_tests.yaml
@@ -0,0 +1,63 @@
name: Run non-FA2-related unit tests

on:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

jobs:
tests:
runs-on: [multi-gpu, nvidia-gpu, 8-t4, ci]
container:
image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v3
- name: Python environment
run: |
which python
python --version
- name: Check Pytorch version
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install nanotron
run: |
python -m pip install --upgrade pip
pip install packaging
pip install wheel
pip install -e .
pip install -e .[dev]
pip install -e .[test]
- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt

- name: Run tests
# NOTE: -m "not fa2" will run all the unit tests that don't have the mark
# "fa2" (these are FA2-related tests, we can't run it on T4)
run: |
pytest \
-m "not fa2" \
--color=yes \
--durations=0 \
--ignore tests/kernels \
--ignore tests/fp8 \
--verbose \
tests/
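
For context on the marker filter above: a minimal sketch of how a test opts into the "fa2" mark so that -m "not fa2" deselects it on the T4 runners (the test name and body here are illustrative, not taken from this commit):

import pytest

@pytest.mark.fa2  # deselected by `pytest -m "not fa2"`, selected by `pytest -m fa2`
def test_flash_attn_is_importable():
    # Skip (rather than fail) when flash-attn 2 isn't installed on the runner.
    flash_attn = pytest.importorskip("flash_attn")
    assert flash_attn is not None
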
26 changes: 26 additions & 0 deletions .github/workflows/code_quality.yaml
@@ -0,0 +1,26 @@
name: Code Quality

on:
workflow_dispatch:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"

jobs:
cloc:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Count Lines of Code (cloc)
uses: djdefi/cloc-action@6
with:
options: --by-file-by-lang --exclude-dir=docs,tests,examples --exclude-lang=YAML,Markdown,TOML --exclude-list-file=sanity_checks.py
58 changes: 58 additions & 0 deletions .github/workflows/fa2_unit_tests.yaml
@@ -0,0 +1,58 @@
name: Run FA2-related unit tests

on:
workflow_dispatch:
push:
branches: [ main ]
# Only run tests if we modify the following files
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

pull_request:
branches: [ '**' ]
paths:
- "src/**/*.py"
- "examples/**/*.py"
- "tests/**/*.py"

jobs:
tests:
runs-on: [single-gpu, nvidia-gpu, a10, ci]
container:
image: runpod/pytorch:2.1.1-py3.10-cuda12.1.1-devel-ubuntu22.04
ports:
- 80
options: --gpus all --shm-size "8G"
steps:
- uses: actions/checkout@v3

- name: Python environment
run: |
which python
python --version
- name: Check Pytorch version
run: |
nvidia-smi
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
- name: Install nanotron
run: |
python -m pip install --upgrade pip
pip install packaging
pip install wheel
pip install "flash-attn>=2.5.0" --no-build-isolation
pip install -e .
pip install -e .[dev]
pip install -e .[test]
- name: Show installed libraries and their versions
run: pip freeze | tee installed.txt

- name: Run tests
# NOTE: -m fa2 will only run the unit tests that have the mark
# "fa2" (these are FA2-related tests)
run: pytest -m fa2 --color=yes --durations=0 --ignore tests/fp8 --verbose tests/
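
One more note on the "fa2" mark: pytest warns about unregistered marks, so the repository is expected to register it somewhere; a minimal sketch of such a registration in a conftest.py (the actual location and wording used by nanotron are not shown in this diff):

# conftest.py (illustrative)
def pytest_configure(config):
    # Register the custom mark so -m fa2 / -m "not fa2" filtering does not
    # trigger PytestUnknownMarkWarning.
    config.addinivalue_line("markers", "fa2: tests that require flash-attn 2")
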
1 change: 0 additions & 1 deletion .gitignore
@@ -160,6 +160,5 @@ cython_debug/
#.idea/

.vscode
- .github

checkpoints/
14 changes: 13 additions & 1 deletion src/nanotron/distributed.py
@@ -9,6 +9,8 @@
from torch.distributed import * # noqa
from torch.distributed.distributed_c10d import ProcessGroup

from nanotron.utils import find_free_port

torch_version_above_1_13 = version.parse(torch.__version__) >= version.parse("1.13.0")
Work = dist.Work if torch_version_above_1_13 else dist._Work
default_pg_timeout = datetime.timedelta(minutes=10)
@@ -257,5 +259,15 @@ def initialize_torch_distributed():
backend = "gloo"

# Call the init process.
- dist.init_process_group(backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+
+ port = os.getenv("MASTER_PORT")
+ if port is None:
+     port = find_free_port()
+ else:
+     port = int(port)
+
+ init_method = f"env://localhost:{port}"
+ dist.init_process_group(
+     init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+ )
return True
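
The net effect of this hunk is that the rendezvous port comes from MASTER_PORT when it is set and from find_free_port() otherwise, so parallel CI jobs on one machine don't collide on a hard-coded port. A standalone sketch of that fallback (assumes nanotron is installed; this is not nanotron's own test code):

import os

from nanotron.utils import find_free_port

# Prefer an explicitly configured MASTER_PORT; otherwise grab a free one.
port = os.getenv("MASTER_PORT")
port = int(port) if port is not None else find_free_port()
print(f"rendezvous port: {port}")
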
4 changes: 2 additions & 2 deletions src/nanotron/optim/clip_grads.py
@@ -56,7 +56,7 @@ def clip_grad_norm(
torch.stack([torch.linalg.vector_norm(g.detach(), ord=torch.inf, dtype=torch.float) for g in grads])
)
else:
- total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+ total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.MAX)

else:
@@ -68,7 +68,7 @@ def clip_grad_norm(
dtype=torch.float,
).pow(norm_type)
else:
- total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+ total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.SUM)
total_norm.pow_(1.0 / norm_type)

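The torch.zeros(1) -> torch.zeros([]) change swaps a 1-element vector for a 0-dim scalar, matching the shape torch.linalg.vector_norm returns on the ranks that do own gradients, so the all_reduce operates on consistently shaped tensors. A quick illustration (CPU tensors here instead of CUDA):

import torch

print(torch.zeros(1).shape)                           # torch.Size([1]) - 1-element vector
print(torch.zeros([]).shape)                          # torch.Size([])  - 0-dim scalar
print(torch.linalg.vector_norm(torch.ones(3)).shape)  # torch.Size([])  - also a 0-dim scalar
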
9 changes: 8 additions & 1 deletion src/nanotron/parallel/context.py
@@ -35,7 +35,7 @@ def __init__(
)

if not dist.is_available():
- raise ValueError("`torch.distributed is not available as a package, please install it.")
+ raise ValueError("torch.distributed is not available as a package, please install it.")

self.tensor_parallel_size = tensor_parallel_size
self.pipeline_parallel_size = pipeline_parallel_size
@@ -148,3 +148,10 @@ def get_3d_ranks(self, world_rank: int) -> Tuple[int, int, int]:
dp_rank = (world_rank // self.tp_pg.size()) % self.dp_pg.size()
tp_rank = world_rank % self.tp_pg.size()
return (pp_rank, dp_rank, tp_rank)

def destroy(self):
if not dist.is_initialized():
return

dist.barrier()
dist.destroy_process_group()
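
A minimal sketch of how a test might lean on the new destroy() helper for teardown (the wrapper function here is an assumption, not part of this commit):

def run_with_cleanup(parallel_context, test_fn):
    # Tear the process group down even if the test body raises;
    # destroy() is a no-op when torch.distributed was never initialized.
    try:
        test_fn(parallel_context)
    finally:
        parallel_context.destroy()  # dist.barrier() then dist.destroy_process_group()
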
14 changes: 14 additions & 0 deletions src/nanotron/utils.py
@@ -2,6 +2,8 @@
import inspect
import math
import os
import random
import socket
from contextlib import ExitStack, contextmanager
from typing import Callable, ContextManager, List, Optional

@@ -147,3 +149,15 @@ def tensor_from_untyped_storage(untyped_storage: torch.UntypedStorage, dtype: to
tensor = torch.empty([], dtype=dtype, device=device)
tensor.set_(source=untyped_storage)
return tensor


def find_free_port(min_port: int = 2000, max_port: int = 65000) -> int:
while True:
port = random.randint(min_port, max_port)
try:
with socket.socket() as sock:
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
sock.bind(("localhost", port))
return port
except OSError:
continue