Merge branch 'tokenizer_args' into 'main'
Keep tokenization args in sync between tools/ and training/

See merge request ADLR/megatron-lm!2317
deepakn94 committed Nov 9, 2024
2 parents 0343d03 + 5ebcc5a commit 1b8fce7
Showing 2 changed files with 39 additions and 47 deletions.
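For context, the change factors the tokenizer-related flags into a single _add_tokenizer_args(parser) helper in megatron/training/arguments.py, which both the training parser and tools/preprocess_data.py now call, so the two flag lists cannot drift apart. Below is a minimal, self-contained sketch of that pattern, not the full Megatron-LM code: flag names are taken from the diff, the bodies are trimmed, and the entry-point function names are illustrative.

# Sketch of the shared-argument-group pattern introduced by this commit;
# trimmed-down flag list, illustrative only.
import argparse


def _add_tokenizer_args(parser):
    group = parser.add_argument_group(title='tokenizer')
    group.add_argument('--tokenizer-type', type=str, default=None,
                       choices=['GPT2BPETokenizer', 'SentencePieceTokenizer',
                                'NullTokenizer'],
                       help='What type of tokenizer to use.')
    group.add_argument('--vocab-file', type=str, default=None,
                       help='Path to the vocab file.')
    group.add_argument('--merge-file', type=str, default=None,
                       help='Path to the BPE merge file.')
    return parser


def training_parse_args(argv=None):
    # Mirrors megatron/training/arguments.py: the tokenizer group is one of
    # several shared groups added to the training parser.
    parser = argparse.ArgumentParser(description='training (sketch)')
    parser = _add_tokenizer_args(parser)
    return parser.parse_args(argv)


def preprocess_get_args(argv=None):
    # Mirrors tools/preprocess_data.py: the tool reuses the same helper
    # instead of redefining its own tokenizer group.
    parser = argparse.ArgumentParser(description='preprocess_data (sketch)')
    parser = _add_tokenizer_args(parser)
    parser.add_argument('--input', type=str, required=True,
                        help='Path to input JSON')
    return parser.parse_args(argv)


if __name__ == '__main__':
    args = preprocess_get_args(['--input', 'corpus.json',
                                '--tokenizer-type', 'GPT2BPETokenizer'])
    print(args.tokenizer_type, args.input)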
66 changes: 36 additions & 30 deletions megatron/training/arguments.py
@@ -39,6 +39,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
parser = _add_distributed_args(parser)
parser = _add_validation_args(parser)
parser = _add_data_args(parser)
parser = _add_tokenizer_args(parser)
parser = _add_autoresume_args(parser)
parser = _add_biencoder_args(parser)
parser = _add_vision_args(parser)
@@ -1635,6 +1636,41 @@ def _add_validation_args(parser):
return parser


def _add_tokenizer_args(parser):
group = parser.add_argument_group(title='tokenizer')
group.add_argument('--vocab-size', type=int, default=None,
help='Size of vocab before EOD or padding.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file.')
group.add_argument('--vocab-extra-ids', type=int, default=0,
help='Number of additional vocabulary tokens. '
'They are used for span masking in the T5 model')
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer',
'SentencePieceTokenizer',
'GPTSentencePieceTokenizer',
'HuggingFaceTokenizer',
'Llama2Tokenizer',
'TikTokenizer',
'MultimodalTokenizer',
'NullTokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='Sentencepiece tokenizer model.')
group.add_argument('--tiktoken-pattern', type=str, default=None,
help='Which tiktoken pattern to use. Options: [v1, v2]')
group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
help='Number of special tokens in tiktoken tokenizer')
group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
help='List of tiktoken special tokens, needs to have ["<unk>", "<s>", "</s>"]')
return parser


def _add_data_args(parser):
group = parser.add_argument_group(title='data and dataloader')

@@ -1673,15 +1709,6 @@ def _add_data_args(parser):
group.add_argument('--mock-data', action='store_true',
help='Skip data loading and validation and opt for artificial '
'generation of mock data when an implementation is available.')
group.add_argument('--vocab-size', type=int, default=None,
help='Size of vocab before EOD or padding.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file.')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file.')
group.add_argument('--vocab-extra-ids', type=int, default=0,
help='Number of additional vocabulary tokens. '
'They are used for span masking in the T5 model')
group.add_argument('--seq-length', type=int, default=None,
help='Maximum sequence length to process.')
group.add_argument('--encoder-seq-length', type=int, default=None,
@@ -1701,27 +1728,6 @@
help='Probability of producing a short sequence.')
group.add_argument('--num-workers', type=int, default=2,
help="Dataloader number of workers.")
group.add_argument('--tokenizer-type', type=str,
default=None,
choices=['BertWordPieceLowerCase',
'BertWordPieceCase',
'GPT2BPETokenizer',
'SentencePieceTokenizer',
'GPTSentencePieceTokenizer',
'HuggingFaceTokenizer',
'Llama2Tokenizer',
'TikTokenizer',
'MultimodalTokenizer',
'NullTokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='Sentencepiece tokenizer model.')
group.add_argument('--tiktoken-pattern', type=str, default=None,
help='Which tiktoken pattern to use. Options: [v1, v2]')
group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
help='Number of special tokens in tiktoken tokenizer')
group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
help='List of tiktoken special tokens, needs to have ["<unk>", "<s>", "</s>"]')
group.add_argument('--reset-position-ids', action='store_true',
help='Reset position ids after end-of-document token.')
group.add_argument('--reset-attention-mask', action='store_true',
20 changes: 3 additions & 17 deletions tools/preprocess_data.py
@@ -23,6 +23,7 @@
nltk_available = False

from megatron.training.tokenizer import build_tokenizer
from megatron.training.arguments import _add_tokenizer_args
from megatron.core.datasets import indexed_dataset


@@ -188,6 +189,7 @@ def process_json_file(self, file_name):

def get_args():
parser = argparse.ArgumentParser()
parser = _add_tokenizer_args(parser)
group = parser.add_argument_group(title='input data')
group.add_argument('--input', type=str, required=True,
help='Path to input JSON')
@@ -197,30 +199,14 @@ def get_args():
help='Split documents into sentences.')
group.add_argument('--keep-newlines', action='store_true',
help='Keep newlines between sentences when splitting.')

group = parser.add_argument_group(title='tokenizer')
group.add_argument('--tokenizer-type', type=str, required=True,
choices=['BertWordPieceLowerCase','BertWordPieceCase',
'GPT2BPETokenizer', 'SentencePieceTokenizer',
'GPTSentencePieceTokenizer', 'Llama2Tokenizer',
'Llama3Tokenizer', 'MistralTokenizer', 'NullTokenizer'],
help='What type of tokenizer to use.')
group.add_argument('--tokenizer-model', type=str, default=None,
help='YTTM tokenizer model.')
group.add_argument('--vocab-file', type=str, default=None,
help='Path to the vocab file')
group.add_argument('--vocab-size', default=786,
help='size of vocab for use with NullTokenizer')
group.add_argument('--merge-file', type=str, default=None,
help='Path to the BPE merge file (if necessary).')
group = parser.add_argument_group(title='tokenization process')
group.add_argument('--append-eod', action='store_true',
help='Append an <eod> token to the end of a document.')
group.add_argument('--lang', type=str, default='english',
help='Language to use for NLTK-powered sentence splitting.')
group = parser.add_argument_group(title='output data')
group.add_argument('--output-prefix', type=str, required=True,
help='Path to binary output file without suffix')

group = parser.add_argument_group(title='runtime')
group.add_argument('--workers', type=int, required=True,
help=('Number of worker processes to launch.'
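With the shared group wired in, the preprocessing tool accepts the same tokenizer flags as training. An illustrative snippet, assuming Megatron-LM is importable; _add_tokenizer_args and the flag names come from the diff above, and the vocab/merge paths are placeholder strings (argparse does not open them):

# Illustrative only: exercising the shared tokenizer group directly.
import argparse

from megatron.training.arguments import _add_tokenizer_args

parser = argparse.ArgumentParser()
parser = _add_tokenizer_args(parser)
args = parser.parse_args([
    '--tokenizer-type', 'GPT2BPETokenizer',
    '--vocab-file', 'gpt2-vocab.json',
    '--merge-file', 'gpt2-merges.txt',
])
print(args.tokenizer_type)  # -> GPT2BPETokenizer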
