Update BLOOM parameter counts #18531

Merged · 2 commits · Aug 12, 2022

docs/source/en/model_doc/bloom.mdx (10 changes: 5 additions & 5 deletions)

@@ -18,11 +18,11 @@ The BLOOM model has been proposed with its various versions through the [BigScie
 The architecture of BLOOM is essentially similar to GPT-3 (an auto-regressive model for next-token prediction), but it has been trained on 46 different languages and 13 programming languages.
 Several smaller versions of the model have been trained on the same dataset. BLOOM is available in the following versions:

-- [bloom-350m](https://huggingface.co/bigscience/bloom-350m)
-- [bloom-760m](https://huggingface.co/bigscience/bloom-760m)
-- [bloom-1b3](https://huggingface.co/bigscience/bloom-1b3)
-- [bloom-2b5](https://huggingface.co/bigscience/bloom-2b5)
-- [bloom-6b3](https://huggingface.co/bigscience/bloom-6b3)
+- [bloom-560m](https://huggingface.co/bigscience/bloom-560m)
+- [bloom-1b1](https://huggingface.co/bigscience/bloom-1b1)
+- [bloom-1b7](https://huggingface.co/bigscience/bloom-1b7)
+- [bloom-3b](https://huggingface.co/bigscience/bloom-3b)
+- [bloom-7b1](https://huggingface.co/bigscience/bloom-7b1)
 - [bloom](https://huggingface.co/bigscience/bloom) (176B parameters)


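For downstream code that referenced the old names, the renamed checkpoints load exactly as before; a minimal sketch (the prompt and generation settings are illustrative, not part of this PR):

```python
from transformers import BloomForCausalLM, BloomTokenizerFast

# Smallest of the renamed checkpoints; any entry in the list above works the same way.
checkpoint = "bigscience/bloom-560m"
tokenizer = BloomTokenizerFast.from_pretrained(checkpoint)
model = BloomForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("I enjoy walking with my cute dog", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```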
src/transformers/models/bloom/configuration_bloom.py (10 changes: 5 additions & 5 deletions)

@@ -31,11 +31,11 @@

 BLOOM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     "bigscience/bloom": "https://huggingface.co/bigscience/bloom/resolve/main/config.json",
-    "bigscience/bloom-350m": "https://huggingface.co/bigscience/bloom-350m/blob/main/config.json",
-    "bigscience/bloom-760m": "https://huggingface.co/bigscience/bloom-760m/blob/main/config.json",
-    "bigscience/bloom-1b3": "https://huggingface.co/bigscience/bloom-1b3/blob/main/config.json",
-    "bigscience/bloom-2b5": "https://huggingface.co/bigscience/bloom-2b5/blob/main/config.json",
-    "bigscience/bloom-6b3": "https://huggingface.co/bigscience/bloom-6b3/blob/main/config.json",
+    "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/config.json",
+    "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/config.json",
+    "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/config.json",
+    "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/config.json",
+    "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/config.json",
 }


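The map above is only a convenience index; the renamed keys resolve through `from_pretrained` as usual. A minimal sketch (the printed attributes are standard `BloomConfig` fields, assumed rather than shown in this diff):

```python
from transformers import BloomConfig

# Fetches config.json for the renamed checkpoint from the Hub.
config = BloomConfig.from_pretrained("bigscience/bloom-560m")
print(config.hidden_size, config.n_layer, config.n_head)
```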
src/transformers/models/bloom/modeling_bloom.py (12 changes: 6 additions & 6 deletions)

@@ -38,17 +38,17 @@

 logger = logging.get_logger(__name__)

-_CHECKPOINT_FOR_DOC = "bigscience/bloom-350m"
+_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m"
 _CONFIG_FOR_DOC = "BloomConfig"
 _TOKENIZER_FOR_DOC = "BloomTokenizerFast"

 BLOOM_PRETRAINED_MODEL_ARCHIVE_LIST = [
     "bigscience/bigscience-small-testing",
-    "bigscience/bloom-350m",
-    "bigscience/bloom-760m",
-    "bigscience/bloom-1b3",
-    "bigscience/bloom-2b5",
-    "bigscience/bloom-6b3",
+    "bigscience/bloom-560m",
+    "bigscience/bloom-1b1",
+    "bigscience/bloom-1b7",
+    "bigscience/bloom-3b",
+    "bigscience/bloom-7b1",
     "bigscience/bloom",
 ]

src/transformers/models/bloom/tokenization_bloom_fast.py (10 changes: 5 additions & 5 deletions)

@@ -36,11 +36,11 @@
 PRETRAINED_VOCAB_FILES_MAP = {
     "tokenizer_file": {
         "bigscience/tokenizer": "https://huggingface.co/bigscience/tokenizer/blob/main/tokenizer.json",
-        "bigscience/bloom-350m": "https://huggingface.co/bigscience/bloom-350m/blob/main/tokenizer.json",
-        "bigscience/bloom-760m": "https://huggingface.co/bigscience/bloom-760m/blob/main/tokenizer.json",
-        "bigscience/bloom-1b3": "https://huggingface.co/bigscience/bloom-1b3/blob/main/tokenizer.json",
-        "bigscience/bloom-2b5": "https://huggingface.co/bigscience/bloom-2b5/blob/main/tokenizer.json",
-        "bigscience/bloom-6b3": "https://huggingface.co/bigscience/bloom-2b5/blob/main/tokenizer.json",
+        "bigscience/bloom-560m": "https://huggingface.co/bigscience/bloom-560m/blob/main/tokenizer.json",
+        "bigscience/bloom-1b1": "https://huggingface.co/bigscience/bloom-1b1/blob/main/tokenizer.json",
+        "bigscience/bloom-1b7": "https://huggingface.co/bigscience/bloom-1b7/blob/main/tokenizer.json",
+        "bigscience/bloom-3b": "https://huggingface.co/bigscience/bloom-3b/blob/main/tokenizer.json",
+        "bigscience/bloom-7b1": "https://huggingface.co/bigscience/bloom-7b1/blob/main/tokenizer.json",
         "bigscience/bloom": "https://huggingface.co/bigscience/bloom/blob/main/tokenizer.json",
     },
 }
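Every BLOOM size ships the same BigScience tokenizer, so any renamed entry (or the standalone `bigscience/tokenizer` repo) should yield identical tokenization; a minimal sketch, assuming the Hub files above are reachable:

```python
from transformers import BloomTokenizerFast

# The tokenizer is shared across all BLOOM sizes, so the checkpoint choice
# only affects which tokenizer.json is downloaded, not the encoding itself.
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
print(tokenizer("Hello my name is").input_ids)
```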
tests/models/bloom/test_modeling_bloom.py (34 changes: 17 additions & 17 deletions)

@@ -379,27 +379,27 @@ def test_model_from_pretrained(self):
     def test_simple_generation(self):
         # This test is a bit flaky. For some GPU architectures, PyTorch sets allow_fp16_reduced_precision_reduction = True by default, and some operations
         # do not give the same results under this configuration, especially torch.baddbmm and torch.bmm. https://pytorch.org/docs/stable/notes/numerical_accuracy.html#fp16-on-mi200
-        # As we leave the default value (True) for allow_fp16_reduced_precision_reduction, the tests fail when running in half-precision with smaller models (350m)
+        # As we leave the default value (True) for allow_fp16_reduced_precision_reduction, the tests fail when running in half-precision with smaller models (560m)
         # Please see: https://pytorch.org/docs/stable/notes/cuda.html#reduced-precision-reduction-in-fp16-gemms
         # This discrepancy is observed only with small models; larger models appear stable.
         # Our conclusion is that these operations are flaky for small inputs but seem to be stable for larger inputs (for the functions `baddbmm` and `bmm`), and therefore for larger models.

         # Here is a summary of an ablation study of our observations
         # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, and I love to watch the kids play. I am a very active person, and I am a very good listener. I am a very good person, and I am a very good person. I am a"
-        # 350m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS
-        # 350m + allow_fp16_reduced_precision_reduction = False + torch.baddbmm ==> PASS
-        # 350m + allow_fp16_reduced_precision_reduction = True + torch.baddbmm ==> PASS
-        # 350m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL
+        # 560m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS
+        # 560m + allow_fp16_reduced_precision_reduction = False + torch.baddbmm ==> PASS
+        # 560m + allow_fp16_reduced_precision_reduction = True + torch.baddbmm ==> PASS
+        # 560m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> FAIL

         # EXPECTED_OUTPUT = "I enjoy walking with my cute dog, but I also enjoy hiking, biking, and swimming. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love to cook and bake. I love"
-        # >=760m + allow_fp16_reduced_precision_reduction = True + torch.baddbmm ==> PASS (for use_cache=True and use_cache=False)
-        # >=760m + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS
-        # >=760m + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS
+        # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.baddbmm ==> PASS (for use_cache=True and use_cache=False)
+        # >=1b1 + allow_fp16_reduced_precision_reduction = True + torch.bmm ==> PASS
+        # >=1b1 + allow_fp16_reduced_precision_reduction = False + torch.bmm ==> PASS

-        path_350m = "bigscience/bloom-350m"
-        model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda()
+        path_560m = "bigscience/bloom-560m"
+        model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda()
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_350m)
+        tokenizer = BloomTokenizerFast.from_pretrained(path_560m)

         input_sentence = "I enjoy walking with my cute dog"
         # This output has been obtained using the fp32 model on the huggingface DGX workstation - NVIDIA A100 GPU
@@ -416,10 +416,10 @@ def test_simple_generation(self):
     @slow
     @require_torch_gpu
     def test_batch_generation(self):
-        path_350m = "bigscience/bloom-350m"
-        model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda()
+        path_560m = "bigscience/bloom-560m"
+        model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda()
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_350m, padding_side="left")
+        tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left")

         input_sentence = ["I enjoy walking with my cute dog", "I enjoy walking with my cute dog"]

@@ -437,10 +437,10 @@ def test_batch_generation(self):
     @require_torch_gpu
     def test_batch_generation_padd(self):

-        path_350m = "bigscience/bloom-350m"
-        model = BloomForCausalLM.from_pretrained(path_350m, use_cache=True, revision="gs555750").cuda()
+        path_560m = "bigscience/bloom-560m"
+        model = BloomForCausalLM.from_pretrained(path_560m, use_cache=True, revision="gs555750").cuda()
         model = model.eval()
-        tokenizer = BloomTokenizerFast.from_pretrained(path_350m, padding_side="left")
+        tokenizer = BloomTokenizerFast.from_pretrained(path_560m, padding_side="left")

         input_sentence = ["I enjoy walking with my cute dog", "Hello my name is"]
         input_sentence_without_pad = "Hello my name is"
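The flakiness discussed in the comments above hinges on a single PyTorch switch; a minimal sketch of the mitigation the ablation points to (standard PyTorch API, not something this PR adds):

```python
import torch

# Force fp32 accumulation in fp16 GEMMs (torch.bmm / torch.baddbmm) so that
# small-model generations are reproducible across GPU architectures, at some
# speed cost on GPUs that otherwise use reduced-precision reductions.
torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
```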
tests/onnx/test_onnx_v2.py (2 changes: 1 addition & 1 deletion)

@@ -213,7 +213,7 @@ def test_values_override(self):
 }

 PYTORCH_EXPORT_WITH_PAST_MODELS = {
-    ("bloom", "bigscience/bloom-350m"),
+    ("bloom", "bigscience/bloom-560m"),
     ("gpt2", "gpt2"),
     ("gpt-neo", "EleutherAI/gpt-neo-125M"),
 }