Reenable tests that were accidentally disabled #746

Merged
merged 8 commits into from
Nov 18, 2023
187 changes: 75 additions & 112 deletions tests/test_model.py
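The diff applies one pattern throughout the file: instead of parametrizing each test over a device value and skipping at runtime when CUDA is unavailable, the GPU-only parameter combinations now carry pytest.mark.gpu directly, so they are selected or deselected at collection time. A minimal sketch of the pattern, assuming a CI setup that selects GPU tests with -m gpu (the test name below is illustrative, not part of the PR):

import pytest

@pytest.mark.parametrize('attn_impl', [
    'torch',                                       # runs on any machine
    pytest.param('flash', marks=pytest.mark.gpu),  # collected only when the gpu mark is selected
    pytest.param('triton', marks=pytest.mark.gpu),
    pytest.param('torch', marks=pytest.mark.gpu),
])
def test_attn_impl_smoke(attn_impl: str):
    assert attn_impl in ('torch', 'flash', 'triton')

Marking individual pytest.param entries keeps the CPU variant runnable everywhere, while the GPU variants are picked up only by runs that ask for the gpu mark, which is presumably how the accidentally disabled cases come back into CI.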
@@ -304,17 +304,13 @@ def test_full_forward_and_backward_t5_small(batch_size: int = 2):
assert not torch.equal(original_params, updated_params)


@pytest.mark.gpu
@pytest.mark.parametrize(
'attn_impl,precision',
[('torch', torch.float16), ('torch', torch.bfloat16),
pytest.param('flash', torch.float16, marks=pytest.mark.gpu),
pytest.param('flash', torch.bfloat16, marks=pytest.mark.gpu)])
def test_determinism(attn_impl: str, precision: torch.dtype):
if not torch.cuda.is_available():
pytest.skip(
'This test requires CUDA to be available in order to run with bfloat16 precision.'
)

conf_path = 'scripts/train/yamls/pretrain/testing.yaml'
with open(conf_path) as f:
test_cfg = om.load(f)
@@ -519,10 +515,12 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
assert block.resid_ffn_dropout.p == 0.2


@pytest.mark.parametrize('attention_impl,device', [('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu')])
@pytest.mark.parametrize('attention_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu)
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
'rope': False
@@ -550,24 +548,20 @@ def test_mpt_creation(norm_type: str, no_bias: bool, tie_word_embeddings: bool):
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_forward_with_padding(attention_impl: str, device: str,
pos_emb_config: dict, tie_word_embeddings: bool):
def test_forward_with_padding(attention_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
# Test that different placement of padding does not affect the output.
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
)
alibi = pos_emb_config['alibi']
if alibi and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')

rope = pos_emb_config['rope']
if rope and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if rope and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')

composer_device = get_device(device)
composer_device = get_device(None)

hf_config = MPTConfig(
init_device='cpu',
@@ -743,12 +737,12 @@ def test_advanced_mask_building(attention_impl: str):
assert torch.equal(attn_bias, expected_attn_bias)


@pytest.mark.parametrize('attention_impl,device,precision', [
('torch', 'cpu', 'fp32'),
('flash', 'gpu', 'amp_bf16'),
('triton', 'gpu', 'amp_bf16'),
('torch', 'gpu', 'amp_bf16'),
('torch', 'gpu', 'fp32'),
@pytest.mark.parametrize('attention_impl,precision', [
('torch', 'fp32'),
pytest.param('flash', 'amp_bf16', marks=pytest.mark.gpu),
pytest.param('triton', 'amp_bf16', marks=pytest.mark.gpu),
pytest.param('torch', 'amp_bf16', marks=pytest.mark.gpu),
pytest.param('torch', 'fp32', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
@@ -777,25 +771,21 @@ def test_advanced_mask_building(attention_impl: str):
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_generate(attention_impl: str, device: str, precision: str,
pos_emb_config: dict, tie_word_embeddings: bool):
def test_generate(attention_impl: str, precision: str, pos_emb_config: dict,
tie_word_embeddings: bool):
# Test that generate works, and produces the same output with or without
# padding in the input.
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
)
if pos_emb_config['alibi'] and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')

if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')
if attention_impl == 'torch' and precision == 'amp_bf16' and tie_word_embeddings == False:
pytest.skip(f'This test configuration has precision / sampling issues.')

composer_device = get_device(device)
composer_device = get_device(None)

hf_config = MPTConfig(
init_device='cpu',
@@ -878,8 +868,6 @@ def test_generate(attention_impl: str, device: str, precision: str,
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_generate_with_device_map(tmp_path: pathlib.Path, world_size: int,
use_cache: bool, tie_word_embeddings: bool):
if not torch.cuda.is_available():
pytest.skip(f'This test requires CUDA to be available.')
if not torch.cuda.device_count() >= world_size:
pytest.skip(f'This test requires {world_size} GPUs.')

@@ -978,11 +966,11 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
check_hf_model_equivalence(mpt, mpt2)


@pytest.mark.parametrize('attn_impl,device', [
('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu'),
@pytest.mark.parametrize('attn_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
@@ -1011,22 +999,17 @@ def test_save_from_pretrained(tmp_path: pathlib.Path):
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_forward_with_cache_and_padding(attn_impl: str, device: str,
pos_emb_config: dict,
def test_forward_with_cache_and_padding(attn_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
# Tests that the result is the same with or without padding when using kv caching
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
)
if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')

composer_device = get_device(device)
composer_device = get_device(None)

hf_config = MPTConfig(
init_device='cpu',
@@ -1120,11 +1103,11 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str,
rtol=1e-6)


@pytest.mark.parametrize('attn_impl,device', [
('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu'),
@pytest.mark.parametrize('attn_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
@@ -1153,23 +1136,19 @@ def test_forward_with_cache_and_padding(attn_impl: str, device: str,
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
def test_forward_with_cache(attn_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
# Test that model forward with and without the key-value cache produces the
# same output.
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
)
if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')

if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')

composer_device = get_device(device)
composer_device = get_device(None)

hf_config = MPTConfig(
init_device='cpu',
@@ -1264,11 +1243,11 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
)


@pytest.mark.parametrize('attn_impl,device', [
('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu'),
@pytest.mark.parametrize('attn_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
@@ -1297,20 +1276,16 @@ def test_forward_with_cache(attn_impl: str, device: str, pos_emb_config: dict,
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_generate_with_past_kv(attn_impl: str, device: str,
pos_emb_config: dict, tie_word_embeddings: bool):
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
)
def test_generate_with_past_kv(attn_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')

composer_device = get_device(device)
composer_device = get_device(None)

hf_config = MPTConfig(
init_device='cpu',
@@ -1368,11 +1343,11 @@ def test_generate_with_past_kv(attn_impl: str, device: str,
hf_config.d_model)


@pytest.mark.parametrize('attn_impl,device', [
('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu'),
@pytest.mark.parametrize('attn_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('generation_kwargs', [{
'max_new_tokens': 2,
@@ -1412,24 +1387,22 @@ def test_generate_with_past_kv(attn_impl: str, device: str,
},
}])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
def test_generation_kwargs_dont_crash(attn_impl: str,
generation_kwargs: Dict[str, Any],
pos_emb_config: dict,
tie_word_embeddings: bool):
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
)
if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')

if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')
composer_device = get_device(device)
if device == 'gpu': # Switch deteminism off
composer_device = get_device(None)

if composer_device.name == 'gpu':
torch.use_deterministic_algorithms(False)

hf_config = MPTConfig(
init_device='cpu',
d_model=128,
@@ -1463,7 +1436,8 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
_ = mpt.generate(input_ids=no_padding_input_ids,
attention_mask=no_padding_attention_mask,
**generation_kwargs)
if device == 'gpu': # Switch deteminism back on

if composer_device.name == 'gpu':
reproducibility.configure_deterministic_mode()
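The replacement code keys the determinism toggle off composer_device.name instead of the removed device argument. A minimal sketch of that pattern in isolation, assuming composer's reproducibility.configure_deterministic_mode restores the deterministic settings afterwards (the helper function name is illustrative, not part of the PR):

import torch
from composer.utils import reproducibility

def generate_without_determinism(model, **generation_kwargs):
    # Some sampling kernels used by generate() have no deterministic
    # implementation, so determinism is switched off for the call and
    # restored afterwards.
    torch.use_deterministic_algorithms(False)
    try:
        return model.generate(**generation_kwargs)
    finally:
        reproducibility.configure_deterministic_mode()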


@@ -1499,10 +1473,6 @@ def test_generation_kwargs_dont_crash(attn_impl: str, device: str,
def test_model_to(attention_impl: str, pos_emb_config: dict,
tie_word_embeddings: bool):
# test that moving the model to diff devices and dtypes in diff ways does not break the model
if not torch.cuda.is_available():
pytest.skip(
f'This test requires CUDA to be available in order to run with {attention_impl} attention.'
)
if pos_emb_config['alibi'] and attention_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')

@@ -1597,11 +1567,11 @@ def test_alibi_vs_hf():
torch.testing.assert_close(alibi_bias_hf, alibi_bias_m)


@pytest.mark.parametrize('attn_impl,device', [
('torch', 'cpu'),
('flash', 'gpu'),
('triton', 'gpu'),
('torch', 'gpu'),
@pytest.mark.parametrize('attn_impl', [
'torch',
pytest.param('flash', marks=pytest.mark.gpu),
pytest.param('triton', marks=pytest.mark.gpu),
pytest.param('torch', marks=pytest.mark.gpu),
])
@pytest.mark.parametrize('pos_emb_config', [{
'alibi': False,
@@ -1633,24 +1603,19 @@ def test_alibi_vs_hf():
@pytest.mark.parametrize('output_hidden_states', [True, False])
@pytest.mark.parametrize('tie_word_embeddings', [True, False])
def test_forward_with_output_attentions_and_output_hidden_states(
attn_impl: str, device: str, pos_emb_config: dict,
output_attentions: bool, output_hidden_states: bool,
tie_word_embeddings: bool):
attn_impl: str, pos_emb_config: dict, output_attentions: bool,
output_hidden_states: bool, tie_word_embeddings: bool):
# Test that model forward with output_attentions_and_output_hidden_states
if not torch.cuda.is_available() and device == 'gpu':
pytest.skip(
f'This test requires CUDA to be available in order to run with {attn_impl} attention.'
)
if pos_emb_config['alibi'] and attn_impl == 'flash':
pytest.skip(f'alibi only implemented with torch and triton attention.')
if output_attentions and attn_impl in ['flash', 'triton']:
pytest.skip(f'output_attentions only implemented with torch attention.')
if pos_emb_config['rope'] and pos_emb_config['rope_impl'] == 'dail' and (
device != 'gpu' or not is_flash_v2_installed()):
if pos_emb_config['rope'] and pos_emb_config[
'rope_impl'] == 'dail' and not is_flash_v2_installed():
pytest.skip(
f'dail implementation of rope requires gpu and flash attention 2.')

composer_device = get_device(device)
composer_device = get_device(None)

n_layers = 2

@@ -1708,8 +1673,6 @@ def test_hf_init(tmp_path: pathlib.Path,
init_device: str,
world_size: int,
batch_size: int = 1):
if not torch.cuda.is_available():
pytest.skip(f'This test requires CUDA to be available.')
if not torch.cuda.device_count() >= world_size:
pytest.skip(f'This test requires {world_size} GPUs.')
