Skip to content
Closed
Changes from all commits
Commits
Show all changes
76 commits
Select commit Hold shift + click to select a range
8b04e0e
ICT zeroshot evaluation code
mpatwary Mar 10, 2021
661553f
made more generic, aligned with other tasks
mpatwary Mar 11, 2021
43c9137
Fixed based on review recommendation
mpatwary Mar 19, 2021
4056539
fixed another issue
mpatwary Mar 19, 2021
a5acbf5
Merge branch 'main' into main_retriver_merge_ict_eval
mpatwary Mar 20, 2021
10ff060
implementing DPR
mpatwary Apr 9, 2021
cdde433
Merge branch 'main' into main_retriver_merge_dpr
mpatwary Apr 9, 2021
06076c7
implementation dpr
mpatwary Apr 23, 2021
957d1c9
Merge branch 'main' into main_retriver_merge_dpr
Apr 26, 2021
b9fcb7b
adding dpr code
Apr 29, 2021
8004731
removed comments
Apr 29, 2021
f415dc8
removed comments
Apr 29, 2021
a8d172b
removed comments
Apr 29, 2021
220637f
DPR evaluation debugging
May 11, 2021
d2d5086
DPR ongoing
May 11, 2021
6d03d7a
DPR finetune and evaluation
May 12, 2021
f926720
fixing model evaluation of retriver
May 12, 2021
5409341
added pre and post process
May 12, 2021
7e335e1
added pre and post process
May 12, 2021
f64977f
evaluation works!
May 13, 2021
dca47cf
debugging DPR
May 14, 2021
3f75537
fix copy-n-paste error
stas00 May 17, 2021
07ca952
Typo fix in readme
devrimcavusoglu May 18, 2021
2dae74b
t5 fixes
stas00 May 18, 2021
4a09bb3
Merge branch 'main' into main_retriver_merge_dpr
mpatwary May 18, 2021
7a0710e
before cleaning the comments
mpatwary May 18, 2021
ccae9db
vit pipeline fixes
kvareddy May 18, 2021
2eaf6c7
cleaning the code
mpatwary May 18, 2021
2529380
additional cleaning
mpatwary May 19, 2021
8e44d61
renaming the folders
mpatwary May 19, 2021
113c636
Add temporary assert to finetuning until it can be fixed.
jaredcasper May 19, 2021
7577931
Fixed issues with ICT pretraining
mpatwary May 19, 2021
dfb6a9b
updated the evaluation script for retriver
mpatwary May 19, 2021
f21a662
updated the evaluation script for retriver
mpatwary May 19, 2021
a41e478
updated the evaluation script for retriver
mpatwary May 19, 2021
825375c
updated the evaluation script for retriver
mpatwary May 19, 2021
217f54b
Merge branch 'finetune_assert' into 'main'
shoeybi May 19, 2021
d078e54
added exit interval for finetuning
mpatwary May 20, 2021
63121a9
updating the scripts
mpatwary May 20, 2021
fda81a2
updating no load rng
mpatwary May 25, 2021
01fc083
Merge branch 'vit_pipeline_fixes' into 'main'
jaredcasper Jun 1, 2021
83c4d95
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 1, 2021
c7c65bb
updating script
mpatwary Jun 3, 2021
84eb016
Merge branch 'main' into main_retriver_merge_dpr
mpatwary Jun 3, 2021
3dadd16
Update T5 scripts
deepakn94 Jun 7, 2021
04c79f3
resolved hang issue
mpatwary Jun 8, 2021
ebfbfce
fixed the tensor size mismatch issue
mpatwary Jun 9, 2021
e46f326
fixed the evaluation hangs
mpatwary Jun 10, 2021
a983cab
Adding readme
mpatwary Jun 10, 2021
d562d7b
Adding readme
mpatwary Jun 10, 2021
1095d7e
Adding readme
mpatwary Jun 10, 2021
bab5cc4
Adding readme
mpatwary Jun 10, 2021
8661ca2
Adding readme
mpatwary Jun 10, 2021
293554a
Adding readme
mpatwary Jun 10, 2021
e287bf0
Adding readme
mpatwary Jun 10, 2021
c45109e
Adding readme
mpatwary Jun 10, 2021
473127f
Clean up README.md a bit
jaredcasper Jun 10, 2021
2845047
addressed comments
mpatwary Jun 10, 2021
98113c6
Merge branch 'main_retriver_merge_dpr' of ssh://gitlab-master.nvidia.…
mpatwary Jun 10, 2021
598d7ee
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 10, 2021
2be1e51
Merge branch 't5_scripts' into 'main'
jaredcasper Jun 10, 2021
9d350c9
updated readme
mpatwary Jun 10, 2021
baf2e2a
updated readme
mpatwary Jun 10, 2021
32da2e7
updated readme
mpatwary Jun 10, 2021
4c92ca8
updated readme
mpatwary Jun 10, 2021
82b69e8
Merge branch 'main_retriver_merge_dpr' into 'main'
jaredcasper Jun 11, 2021
7898c9a
Merge branch 't5' of https://github.com/stas00/Megatron-LM into githu…
jaredcasper Jun 11, 2021
e1318f0
Merge branch 'typo-fix' of https://github.com/devrimcavusoglu/Megatro…
jaredcasper Jun 11, 2021
4a35d50
Merge branch 'patch-1' of https://github.com/stas00/Megatron-LM into …
jaredcasper Jun 11, 2021
90e0a0d
Merge branch 'github-pr' into 'main'
jaredcasper Jun 11, 2021
2812b2c
zero.Init() with mpu
tjruwase Aug 2, 2021
7222a97
Remove debug print
tjruwase Aug 2, 2021
3644a9d
Manual revert training.py to main version
tjruwase Aug 2, 2021
3e10eba
Merge branch 'master' of github.com:microsoft/Megatron-DeepSpeed into…
tjruwase Aug 9, 2021
3451116
Use new argument name
tjruwase Aug 9, 2021
d64338a
Revert "Merge branch 'master' of github.com:microsoft/Megatron-DeepSp…
tjruwase Aug 9, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions pretrain_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import os
import subprocess


def model_provider(pre_process=True, post_process=True):
"""Build the model."""

Expand All @@ -41,9 +42,10 @@ def model_provider(pre_process=True, post_process=True):

args = get_args()
with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(),
remote_device=None if args.remote_device=='none' else args.remote_device,
config=args.deepspeed_config,
enabled=args.zero_stage==3):
remote_device=None if args.remote_device == 'none' else args.remote_device,
config_dict_or_path=args.deepspeed_config,
enabled=args.zero_stage == 3,
mpu=mpu):
if args.deepspeed:
model = GPTModelPipe(
num_tokentypes=0,
Expand All @@ -59,14 +61,14 @@ def model_provider(pre_process=True, post_process=True):
attention_mask = torch.tril(torch.ones(
(1, args.seq_length, args.seq_length), device=torch.cuda.current_device())).view(
1, 1, args.seq_length, args.seq_length)

# Convert attention mask to binary:
attention_mask = (attention_mask < 0.5)
if args.fp16:
attention_mask = attention_mask.half()
elif args.bf16:
attention_mask = attention_mask.bfloat16()

args.attn_mask = attention_mask

else:
Expand Down Expand Up @@ -111,6 +113,7 @@ def get_batch(data_iterator):

return tokens, labels, loss_mask, attention_mask, position_ids


def get_batch_pipe(data):
"""Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`"""
args = get_args()
Expand Down Expand Up @@ -138,6 +141,7 @@ def get_batch_pipe(data):

return (tokens, position_ids, attention_mask), (labels, loss_mask)


def loss_func(loss_mask, output_tensor):
losses = output_tensor.float()
loss_mask = loss_mask.view(-1).float()
Expand Down Expand Up @@ -184,10 +188,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):

return train_ds, valid_ds, test_ds


def command_exists(cmd):
    """Return True when the shell can resolve `cmd` as a command.

    Runs the shell's `type` lookup on `cmd` and reports whether that lookup
    exited with status 0.

    Args:
        cmd: Name of the command to look up. It is handed to the shell as a
            single quoted word, so it is treated purely as a name.

    Returns:
        bool: True if `type` exits with status 0, False otherwise.
    """
    import shlex

    # Quote the name so shell metacharacters in `cmd` (";", "&&", "$(...)")
    # cannot run extra commands or override the exit status of `type`.
    lookup = f'type {shlex.quote(str(cmd))}'
    # Discard stdout rather than piping it: the output is never read, and an
    # unread, unclosed pipe leaks a file descriptor.
    completed = subprocess.run(lookup, stdout=subprocess.DEVNULL, shell=True)
    return completed.returncode == 0


def git_ds_info():
from deepspeed.env_report import main as ds_report
ds_report()
Expand Down