
Add fixtures #673

Merged · 20 commits · Oct 25, 2023
18 changes: 7 additions & 11 deletions tests/conftest.py
@@ -1,12 +1,10 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import gc
import os
from typing import List, Optional

import pytest
import torch
from composer.utils import reproducibility

# Allowed options for pytest.mark.world_size()
@@ -18,6 +16,13 @@
# Enforce deterministic mode before any tests start.
reproducibility.configure_deterministic_mode()

# Add the path of any pytest fixture files you want to make global
pytest_plugins = [
'tests.fixtures.autouse',
'tests.fixtures.models',
'tests.fixtures.data',
]


def _add_option(parser: pytest.Parser,
name: str,
@@ -78,12 +83,3 @@ def pytest_collection_modifyitems(config: pytest.Config,
def pytest_sessionfinish(session: pytest.Session, exitstatus: int):
if exitstatus == 5:
session.exitstatus = 0 # Ignore no-test-ran errors


@pytest.fixture(autouse=True)
def clear_cuda_cache(request: pytest.FixtureRequest):
"""Clear memory between GPU tests."""
marker = request.node.get_closest_marker('gpu')
if marker is not None and torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests
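The pytest_plugins list added above registers the new fixture modules globally, so any test in the suite can request these fixtures by name without importing them. A minimal sketch of a consumer (hypothetical test, not part of this PR):

def test_seed_fixture_is_global(random_seed: int):
    # `random_seed` is resolved from tests/fixtures/autouse.py via pytest_plugins
    assert random_seed == 17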
2 changes: 2 additions & 0 deletions tests/fixtures/__init__.py
@@ -0,0 +1,2 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0
39 changes: 39 additions & 0 deletions tests/fixtures/autouse.py
@@ -0,0 +1,39 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

import gc

import pytest
import torch
from composer.utils import dist, get_device, reproducibility


@pytest.fixture(autouse=True)
def initialize_dist(request: pytest.FixtureRequest):
"""Initialize the default PyTorch distributed process group for tests."""
# should we just always initialize dist like in train.py?
_default = pytest.mark.world_size(1).mark
world_size = request.node.get_closest_marker('world_size', _default).args[0]
gpu = request.node.get_closest_marker('gpu')
if world_size > 1:
dist.initialize_dist(get_device('gpu' if gpu is not None else 'cpu'))


@pytest.fixture(autouse=True)
def clear_cuda_cache(request: pytest.FixtureRequest):
"""Clear memory between GPU tests."""
marker = request.node.get_closest_marker('gpu')
if marker is not None and torch.cuda.is_available():
torch.cuda.empty_cache()
gc.collect() # Only gc on GPU tests as it 2x slows down CPU tests


@pytest.fixture
def random_seed() -> int:
return 17


@pytest.fixture(autouse=True)
def seed_all(random_seed: int):
"""Sets the seed for reproducibility."""
reproducibility.seed_all(random_seed)
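The autouse fixtures above run around every test automatically; the world_size and gpu markers only change what they do. A sketch of a test that opts into the distributed GPU path (hypothetical, assuming both markers are registered in conftest.py):

import pytest

@pytest.mark.gpu
@pytest.mark.world_size(2)
def test_two_gpu_case():
    # initialize_dist sees world_size=2 and sets up a GPU process group;
    # clear_cuda_cache empties the CUDA cache before this gpu-marked test runs.
    ...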
58 changes: 58 additions & 0 deletions tests/fixtures/data.py
@@ -0,0 +1,58 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path

from composer.utils import dist
from omegaconf import DictConfig
from pytest import fixture
from torch.utils.data import DataLoader
from transformers import PreTrainedTokenizerBase

from llmfoundry.data.finetuning.dataloader import build_finetuning_dataloader
from tests.data_utils import make_tiny_ft_dataset


@fixture
def tiny_ft_dataset_path(tmp_path: Path, dataset_size: int = 4) -> Path:
"""Creates a tiny dataset and returns the path."""
tiny_dataset_path = tmp_path / 'test-ift-data-small'
tiny_dataset_path.mkdir(exist_ok=True)
tiny_dataset_file = tiny_dataset_path / 'train.jsonl'
if dist.get_world_size() == 1 or dist.get_global_rank() == 0:
make_tiny_ft_dataset(path=str(tiny_dataset_file), size=dataset_size)
return tiny_dataset_path


@fixture
def tiny_ft_dataloader(tiny_ft_dataset_path: Path,
mpt_tokenizer: PreTrainedTokenizerBase,
max_seq_len: int = 128,
device_batch_size: int = 1) -> DataLoader:
dataloader_cfg = DictConfig({
'name': 'finetuning',
'dataset': {
'hf_name': str(tiny_ft_dataset_path),
'split': 'train',
'max_seq_len': max_seq_len,
'decoder_only_format': True,
'allow_pad_trimming': False,
'packing_ratio': None,
'shuffle': True,
},
'drop_last': False,
'num_workers': 4,
'pin_memory': False,
'prefetch_factor': 2,
'persistent_workers': False,
'timeout': 0
})

dataloader = build_finetuning_dataloader(
dataloader_cfg,
mpt_tokenizer,
device_batch_size,
).dataloader

assert isinstance(dataloader, DataLoader)
return dataloader
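Tests can then request tiny_ft_dataloader directly, and pytest builds tiny_ft_dataset_path and mpt_tokenizer for it transitively. A hypothetical consumer:

from torch.utils.data import DataLoader

def test_tiny_ft_dataloader_smoke(tiny_ft_dataloader: DataLoader):
    # assumes the finetuning collator emits an 'input_ids' field in each batch
    batch = next(iter(tiny_ft_dataloader))
    assert 'input_ids' in batch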
70 changes: 70 additions & 0 deletions tests/fixtures/models.py
@@ -0,0 +1,70 @@
# Copyright 2022 MosaicML LLM Foundry authors
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Callable

from omegaconf import DictConfig
from pytest import fixture
from transformers import PreTrainedTokenizerBase

from llmfoundry.models.hf.hf_causal_lm import ComposerHFCausalLM
from llmfoundry.models.model_registry import COMPOSER_MODEL_REGISTRY
from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM
from llmfoundry.utils.builders import build_tokenizer


def _build_model(config: DictConfig, tokenizer: PreTrainedTokenizerBase):
model = COMPOSER_MODEL_REGISTRY[config.name](config, tokenizer)
return model


@fixture
def mpt_tokenizer():
return build_tokenizer('EleutherAI/gpt-neox-20b', {})


@fixture
def build_tiny_mpt(
mpt_tokenizer: PreTrainedTokenizerBase
) -> Callable[..., ComposerMPTCausalLM]:

def build(**kwargs: Any) -> ComposerMPTCausalLM:
config = DictConfig({
'name': 'mpt_causal_lm',
'd_model': 128,
'n_heads': 4,
'n_layers': 2,
'expansion_ratio': 2,
})
config.update(kwargs)
model = _build_model(config, mpt_tokenizer)
assert isinstance(model, ComposerMPTCausalLM)
return model

return build


@fixture
def build_tiny_hf_mpt(
mpt_tokenizer: PreTrainedTokenizerBase
) -> Callable[..., ComposerHFCausalLM]:

def build(**kwargs: Any) -> ComposerHFCausalLM:
config_overrides = {
'd_model': 128,
'n_heads': 4,
'n_layers': 2,
'expansion_ratio': 2,
}
config_overrides.update(kwargs)
config = DictConfig({
'name': 'hf_causal_lm',
'pretrained_model_name_or_path': 'mosaicml/mpt-7b',
'pretrained': False,
'config_overrides': config_overrides,
})
model = _build_model(config, mpt_tokenizer)
assert isinstance(model, ComposerHFCausalLM)
return model

return build
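Because build_tiny_mpt and build_tiny_hf_mpt are factory fixtures that return builder callables, individual tests can override config fields per call instead of sharing one model. A hypothetical usage:

from llmfoundry.models.mpt.modeling_mpt import ComposerMPTCausalLM

def test_build_tiny_mpt_override(build_tiny_mpt):
    # keyword arguments are merged into the base DictConfig before building
    model = build_tiny_mpt(n_layers=3)
    assert isinstance(model, ComposerMPTCausalLM)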
18 changes: 7 additions & 11 deletions tests/test_data_prep_scripts.py
@@ -2,9 +2,9 @@
# SPDX-License-Identifier: Apache-2.0

import os
import shutil
import sys
from argparse import Namespace
from pathlib import Path

# Add repo root to path so we can import scripts and test it
repo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
@@ -13,17 +13,16 @@
from scripts.data_prep.convert_dataset_json import main as main_json


def test_download_script_from_api():
def test_download_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(os.getcwd(), 'my-copy-c4-1')
shutil.rmtree(path, ignore_errors=True)
path = os.path.join(tmp_path, 'my-copy-c4-1')
main_hf(
Namespace(
**{
'dataset': 'c4',
'data_subset': 'en',
'splits': ['val_xsmall'],
'out_root': './my-copy-c4-1',
'out_root': path,
'compression': None,
'concat_tokens': None,
'bos_text': None,
@@ -32,18 +31,16 @@ def test_download_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
shutil.rmtree(path, ignore_errors=False)


def test_json_script_from_api():
def test_json_script_from_api(tmp_path: Path):
# test calling it directly
path = os.path.join(os.getcwd(), 'my-copy-arxiv-1')
shutil.rmtree(path, ignore_errors=True)
path = os.path.join(tmp_path, 'my-copy-arxiv-1')
main_json(
Namespace(
**{
'path': 'scripts/data_prep/example_data/arxiv.jsonl',
'out_root': './my-copy-arxiv-1',
'out_root': path,
'compression': None,
'split': 'train',
'concat_tokens': None,
Expand All @@ -53,4 +50,3 @@ def test_json_script_from_api():
'num_workers': None
}))
assert os.path.exists(path)
shutil.rmtree(path, ignore_errors=False)
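The switch from a cwd-relative output directory to pytest's built-in tmp_path fixture removes the manual shutil.rmtree cleanup: each test writes into its own temporary directory, which pytest discards for it. A minimal illustration (hypothetical, not part of this PR):

from pathlib import Path

def test_output_goes_to_tmp(tmp_path: Path):
    out_root = tmp_path / 'my-output'
    out_root.mkdir()
    # the directory lives only for this test; no explicit cleanup is needed
    assert out_root.exists()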
9 changes: 0 additions & 9 deletions tests/test_flash_triton_torch.py
@@ -3,7 +3,6 @@

import pytest
import torch
from composer.utils import reproducibility
from omegaconf import OmegaConf as om


@@ -39,8 +38,6 @@ def test_attn_impl(attn_impl_0: str,
if alibi and (attn_impl_0 == 'flash' or attn_impl_1 == 'flash'):
pytest.xfail('flash attn does not support alibi')

reproducibility.seed_all(7)

cfg = om.create({
'attn_impl': 'flash',
'd_model': 128,
@@ -135,8 +132,6 @@ def test_vs_mha(attn_impl: str, device: str = 'cuda'):
"""Compare diff attn_impl to torch.nn.MultiheadAttention."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -234,8 +229,6 @@ def test_grouped_attention_heads(attn_impl: str,
"""Ensure grouped_query_attention runs w/ diff n_heads & kv_n_heads."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
@@ -273,8 +266,6 @@ def test_grouped_query_invalid_heads(attn_impl: str, device: str = 'cuda'):
"""Check indivisble combinations of grouped_query_attention."""
from llmfoundry.models.layers import attention

reproducibility.seed_all(17)

cfg = om.create({
'attn_impl': attn_impl,
'd_model': 256,
3 changes: 0 additions & 3 deletions tests/test_hf_config.py
@@ -9,7 +9,6 @@

import pytest
import torch
from composer.utils import reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from transformers import AutoModelForCausalLM
@@ -93,8 +92,6 @@ def test_hf_config_override(
with open(conf_path) as f:
test_cfg = om.load(f)

reproducibility.seed_all(test_cfg.seed)

# Build Model
# For fast initialization, use `meta` device
print('Initializing model...')