diff --git a/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py b/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py index d55c224b5c..0f615b6def 100644 --- a/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py +++ b/torchbenchmark/models/BERT_pytorch/bert_pytorch/model/embedding/position.py @@ -10,7 +10,8 @@ def __init__(self, d_model, max_len=512): # Compute the positional encodings once in log space. pe = torch.zeros(max_len, d_model).float() - pe.require_grad = False + # Changed from upstream, see https://github.com/codertimo/BERT-pytorch/pull/104 + pe.requires_grad = False position = torch.arange(0, max_len).float().unsqueeze(1) div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()