add longformer
reiase committed Oct 8, 2023
1 parent 14bbdc0 commit 7e57189
Showing 23 changed files with 772 additions and 0 deletions.
49 changes: 49 additions & 0 deletions training/benchmarks/longformer/README.md
@@ -0,0 +1,49 @@
## Model Introduction
### longformer-base-4096 model

[longformer-base-4096](https://huggingface.co/allenai/longformer-base-4096) is a BERT-like model that starts from the RoBERTa checkpoint and is pretrained with a masked language modeling (MLM) objective on long documents.
It supports sequences of up to 4,096 tokens. The model was introduced in [this paper](https://arxiv.org/abs/2004.05150), and reference code for working with Longformer can be found
[here](https://github.com/huggingface/transformers/tree/main/examples/research_projects/longform-qa).
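
For orientation only (this is not part of the benchmark scripts), the pretrained checkpoint can be loaded with the `transformers` library. The snippet below is a minimal sketch, assuming the checkpoint is fetched from the Hugging Face hub:

```python
from transformers import LongformerForMaskedLM, LongformerTokenizer

# Load the pretrained tokenizer and MLM model from the hub (or a local path).
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

# Longformer accepts much longer inputs than BERT/RoBERTa: up to 4,096 tokens.
text = "Longformer uses sliding-window attention to process long documents. " * 50
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=4096)
outputs = model(**inputs)
print(outputs.logits.shape)  # (batch_size, sequence_length, vocab_size)
```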

## Model and Training Scripts Source Code
PyTorch case:
This repository includes software from https://github.com/huggingface/transformers/tree/v4.33.0,
licensed under the Apache License 2.0.

Some of the files in this directory were modified by BAAI in 2023 to support FlagPerf.

## Dataset and Model Checkpoints

> Dataset website: https://huggingface.co/datasets/wikitext (the preprocessing script loads the `wikitext-2-raw-v1` configuration)
> Model checkpoint website: https://huggingface.co/allenai/longformer-base-4096

We have already preprocessed the dataset and the model checkpoint files (the preprocessing script is `training/benchmarks/longformer/pytorch/data_preprocessing/create_train_eval_data.py`).
The preprocessed archive can be downloaded directly from https://bd.bcebos.com/klx-pytorch-ipipe-bd/flagperf/datasets/longformer_train.tar.
No additional preprocessing steps need to be conducted.
After decompressing, the dataset and model checkpoint files are organized as follows:

```
longformer_train
|-- dataset
| |-- eval_dataset.npz
| `-- train_dataset.npz
`-- model
|-- config.json
|-- merges.txt
|-- pytorch_model.bin
|-- tokenizer.json
`-- vocab.json
```
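
For reference, the preprocessed `.npz` files can be inspected with NumPy. This is a minimal sketch (not required by the benchmark), assuming the archive has been extracted into `longformer_train/`:

```python
import numpy as np

# Each array stores one row per example, as written by create_train_eval_data.py.
data = np.load("longformer_train/dataset/train_dataset.npz")
print(data.files)                  # ['input_ids', 'attention_mask', 'special_tokens_mask']
print(data["input_ids"].shape)     # (num_examples, sequence_length)
```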

## Benchmark Task and Target Accuracy

This experiment fine-tunes the longformer-base-4096 pretrained checkpoint on the wikitext dataset with a masked language modeling objective.
After fine-tuning for 10 epochs, the longformer-base-4096 model is able to achieve an accuracy score of 90+, which matches the evaluation result in the [report](https://huggingface.co/allenai/longformer-base-4096).
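
The target accuracy configured in `config/_base.py` is 0.64. If that metric is computed as masked-token accuracy (an assumption here, since the evaluator is not shown in this commit), the computation can be sketched as follows, using the `-100` ignore index that `DataCollatorForLanguageModeling` assigns to unmasked positions:

```python
import torch

def masked_lm_accuracy(logits: torch.Tensor, labels: torch.Tensor) -> float:
    """Accuracy over masked positions only; labels of -100 mark unmasked tokens."""
    predictions = logits.argmax(dim=-1)   # (batch_size, seq_len)
    mask = labels != -100                 # only masked tokens contribute to the metric
    correct = (predictions == labels) & mask
    return correct.sum().item() / max(mask.sum().item(), 1)
```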

## AI Frameworks & Accelerators Support

| | Pytorch | Paddle | TensorFlow2 |
| ---------- | ------- | ------ | ----------- |
| Nvidia GPU | [✅](../../nvidia/longformer-pytorch/README.md) | N/A | N/A |
2 changes: 2 additions & 0 deletions training/benchmarks/longformer/pytorch/config/__init__.py
@@ -0,0 +1,2 @@
from ._base import *
from .mutable_params import mutable_params
53 changes: 53 additions & 0 deletions training/benchmarks/longformer/pytorch/config/_base.py
@@ -0,0 +1,53 @@
# DO NOT MODIFY THESE REQUIRED PARAMETERS

# Required parameters
vendor: str = None
data_dir: str = None
name: str = "longformer-base-4096"
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# Optional parameters

# =========================================================
# optimizer & learning rate
# =========================================================
lr: float = 5e-5
weight_decay: float = 0.01

# =========================================================
# train && evaluate
# =========================================================
train_batch_size: int = 16
eval_batch_size: int = 16

max_epoch: int = 1000
target_acc: float = 0.64

do_train: bool = True
distributed: bool = True


# =========================================================
# utils
# =========================================================
seed: int = 42
dist_backend: str = 'nccl'
device: str = None

# =========================================================
# datasets
# =========================================================
dataloader_drop_last: bool = False
dataloader_num_workers: int = 8

# =========================================================
# for driver
# =========================================================
local_rank: int = -1
use_env: bool = True
log_freq: int = 100
print_freq: int = 100
n_device: int = 1
sync_bn: bool = False
gradient_accumulation_steps: int = 1
6 changes: 6 additions & 0 deletions training/benchmarks/longformer/pytorch/config/mutable_params.py
@@ -0,0 +1,6 @@
mutable_params = [
    'vendor', 'data_dir', 'lr', 'weight_decay', 'train_batch_size',
    'gradient_accumulation_steps', 'eval_batch_size', 'do_train',
    'distributed', 'dist_backend', 'device', 'cudnn_benchmark',
    'cudnn_deterministic'
]
70 changes: 70 additions & 0 deletions training/benchmarks/longformer/pytorch/data_preprocessing/create_train_eval_data.py
@@ -0,0 +1,70 @@
import os
import numpy as np
from itertools import chain
from typing import List
import datasets
from datasets import load_dataset
from transformers import LongformerTokenizer, DataCollatorForLanguageModeling

def save_dataset(ds, save_path):
    np.savez(save_path,
             input_ids=ds['input_ids'],
             attention_mask=ds['attention_mask'],
             special_tokens_mask=ds['special_tokens_mask'])

def main():
    data_prefix = 'longformer_train/dataset'
    os.makedirs(data_prefix, exist_ok=True)
    train_datapath = os.path.join(data_prefix, 'train_dataset.npz')
    eval_datapath = os.path.join(data_prefix, 'eval_dataset.npz')

    tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096', use_auth_token=False)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    raw_datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
    column_names = list(raw_datasets["train"].features)
    text_column_name = "text" if "text" in column_names else column_names[0]
    max_seq_length = tokenizer.model_max_length
    if max_seq_length > 1024:
        max_seq_length = 1024

    def tokenize_function(examples):
        return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

    tokenized_datasets = raw_datasets.map(
        tokenize_function,
        batched=True,
        num_proc=None,
        remove_columns=[text_column_name],
        load_from_cache_file=True,
        desc="Running tokenizer on dataset line_by_line",
    )

    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # Drop the small remainder; if total_length < max_seq_length the batch is excluded
        # and an empty dict is returned. Padding could be added instead of dropping.
        total_length = (total_length // max_seq_length) * max_seq_length
        # Split into chunks of max_seq_length.
        result = {
            k: [t[i:i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=None,
        load_from_cache_file=True,
        desc=f"Grouping texts in chunks of {max_seq_length}",
    )
    train_dataset = tokenized_datasets['train'].with_format('numpy')
    save_dataset(train_dataset, train_datapath)

    validation_dataset = tokenized_datasets['validation'].with_format('numpy')
    save_dataset(validation_dataset, eval_datapath)


if __name__ == "__main__":
    main()
1 change: 1 addition & 0 deletions training/benchmarks/longformer/pytorch/dataloaders/__init__.py
@@ -0,0 +1 @@
from .dataloader import build_train_dataloader, build_eval_dataloader
72 changes: 72 additions & 0 deletions training/benchmarks/longformer/pytorch/dataloaders/dataloader.py
@@ -0,0 +1,72 @@
import os
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import datasets
from transformers import LongformerTokenizer, DataCollatorForLanguageModeling

class LongformerDataset(Dataset):
    def __init__(self, filepath):
        origin_data = np.load(filepath)
        self.input_ids = origin_data['input_ids']
        self.special_tokens_mask = origin_data['special_tokens_mask']
        self.attention_mask = origin_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        sample = {
            'input_ids': self.input_ids[idx],
            'special_tokens_mask': self.special_tokens_mask[idx],
            'attention_mask': self.attention_mask[idx]
        }
        return sample

def get_data_collator(config):
    model_path = os.path.join(config.data_dir, 'model')
    tokenizer = LongformerTokenizer.from_pretrained(model_path)
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
    return data_collator

def build_train_sampler(config, dataset):
    if torch.distributed.is_initialized():
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset, num_replicas=world_size, rank=rank, seed=config.seed)
    else:
        generator = torch.Generator()
        generator.manual_seed(config.seed)
        sampler = torch.utils.data.RandomSampler(dataset, generator=generator)
    return sampler

def build_train_dataloader(config):
    data_collator = get_data_collator(config)
    train_dataset = LongformerDataset(
        os.path.join(config.data_dir, 'dataset', 'train_dataset.npz'))
    train_sampler = build_train_sampler(config, train_dataset)
    data_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        collate_fn=data_collator,
        batch_size=config.train_batch_size,
        drop_last=config.dataloader_drop_last,
        num_workers=config.dataloader_num_workers,
    )
    return data_loader

def build_eval_dataloader(config):
    data_collator = get_data_collator(config)
    eval_dataset = LongformerDataset(
        os.path.join(config.data_dir, 'dataset', 'eval_dataset.npz'))
    data_loader = DataLoader(
        eval_dataset,
        collate_fn=data_collator,
        batch_size=config.eval_batch_size,
        drop_last=config.dataloader_drop_last,
        num_workers=config.dataloader_num_workers,
    )
    return data_loader
14 changes: 14 additions & 0 deletions training/benchmarks/longformer/pytorch/model/__init__.py
@@ -0,0 +1,14 @@
import os
from transformers import (AutoConfig,
                          LongformerForMaskedLM,
                          LongformerTokenizerFast,
                          AutoModelForMaskedLM,
                          AutoTokenizer)


def create_model(config):
    model_path = os.path.join(config.data_dir, 'model')
    hfconfig = AutoConfig.from_pretrained(model_path)
    model = LongformerForMaskedLM(hfconfig)
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
    return model, hfconfig, tokenizer
27 changes: 27 additions & 0 deletions training/benchmarks/longformer/pytorch/optimizers/__init__.py
@@ -0,0 +1,27 @@
import torch


def create_optimizer(model, args):
    # Optimizer
    # Split weights in two groups, one with weight decay and the other not.
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.lr)
    return optimizer