
Merging main and resolving conflicts
Signed-off-by: taejinp <tango4j@gmail.com>
tango4j committed Nov 29, 2024
2 parents 2c6eed7 + 2a6d144 commit f469e72
Showing 23 changed files with 1,188 additions and 106 deletions.
9 changes: 8 additions & 1 deletion .github/workflows/release.yml
@@ -20,10 +20,15 @@ on:
description: Ref (SHA or branch name) to release
required: true
type: string
dry-run:
description: Do not publish a wheel and GitHub release.
required: true
default: true
type: boolean

jobs:
release:
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.12.3
uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_release_library.yml@v0.15.0
with:
release-ref: ${{ inputs.release-ref }}
image-name: nemo_container
@@ -35,8 +40,10 @@ jobs:
python-package: nemo
container-workdir: /workspace
library-name: Neural Modules
dry-run: ${{ inputs.dry-run }}
secrets:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
SLACK_RELEASE_ENDPOINT: ${{ secrets.SLACK_RELEASE_ENDPOINT }}
PAT: ${{ secrets.PAT }}
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
1 change: 0 additions & 1 deletion nemo/collections/asr/data/audio_to_text_dataset.py
@@ -871,7 +871,6 @@ def write_on_batch_end(
item["audio_filepath"] = sample.recording.sources[0].source
else:
item["audio_filepath"] = sample.id
item["audio_filepath"] = sample.recording.sources[0].source
item["offset"] = sample.start
item["duration"] = sample.duration
item["text"] = sample.supervisions[0].text or ''
6 changes: 4 additions & 2 deletions nemo/collections/llm/api.py
@@ -265,12 +265,11 @@ def validate(
@run.cli.entrypoint(name="ptq", namespace="llm")
def ptq(
nemo_checkpoint: str,
export_config: ExportConfig,
calib_tp: int = 1,
calib_pp: int = 1,
quantization_config: Annotated[Optional[QuantizationConfig], run.Config[QuantizationConfig]] = None,
export_config: Optional[Union[ExportConfig, run.Config[ExportConfig]]] = None,
) -> Path:
# TODO: Fix "nemo_run.cli.cli_parser.CLIException: An unexpected error occurred (Argument: , Context: {})"
"""
Applies Post-Training Quantization (PTQ) to a model using the specified quantization and export configs. It runs
calibration on a small dataset to collect scaling factors for the low-precision GEMMs used by the desired quantization method.
@@ -297,6 +296,9 @@ def ptq(
Returns:
Path: The path where the quantized checkpoint has been saved after calibration.
"""
if not quantization_config:
quantization_config = QuantizationConfig()

if export_config.path is None:
raise ValueError("The export_config.path needs to be specified, got None.")

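With this change quantization_config is optional and falls back to QuantizationConfig(), while export_config must still carry a valid path. A minimal call sketch against the new signature; the checkpoint paths and the import location of ExportConfig are assumptions, not taken from this diff:

# illustrative sketch, not part of this commit
from nemo.collections.llm.api import ptq
from nemo.collections.llm.quantization import ExportConfig  # assumed import path

quantized_path = ptq(
    nemo_checkpoint="/checkpoints/llama3-8b",                        # hypothetical path
    export_config=ExportConfig(path="/checkpoints/llama3-8b-fp8"),   # .path must be set
    calib_tp=1,
    calib_pp=1,
    # quantization_config omitted: the function now defaults it to QuantizationConfig()
)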
@@ -41,6 +41,7 @@ def __init__(
model_transform=None,
model_accelerator=None,
trust_remote_code=False,
default_dtype=torch.bfloat16,
):
super().__init__()
self.save_hyperparameters()
@@ -53,6 +54,7 @@ def __init__(
self.model_transform = model_transform
self.model_accelerator = model_accelerator
self.trust_remote_code = trust_remote_code
self.default_dtype = default_dtype

@property
def tokenizer(self):
@@ -79,7 +81,10 @@ def configure_model(self):
from transformers import AutoConfig

config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code)
self.model = AutoModelForCausalLM.from_config(config, trust_remote_code=self.trust_remote_code)
dtype = getattr(config, 'torch_dtype', self.default_dtype)
self.model = AutoModelForCausalLM.from_config(
config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code
)

if self.model_accelerator is not None:
self.model_accelerator(self.model)
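The configure_model change above reads torch_dtype from the checkpoint's AutoConfig and only uses the new default_dtype argument as a fallback. A minimal sketch of that pattern; the model name is hypothetical:

# illustrative sketch, not part of this commit
import torch
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("gpt2")              # hypothetical checkpoint
dtype = getattr(config, "torch_dtype", torch.bfloat16)   # fallback used only if the attribute is missing
model = AutoModelForCausalLM.from_config(config, torch_dtype=dtype)

Note that getattr only falls back when the attribute is absent; depending on the transformers version, PretrainedConfig may define torch_dtype as None for checkpoints that do not specify it, in which case from_config receives None and keeps its own default.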
18 changes: 11 additions & 7 deletions nemo/collections/llm/recipes/t5_11b.py
@@ -175,15 +175,17 @@ def pretrain_recipe(
guide in the `examples/llm/pretrain/` directory.
"""

opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=None,
warmup_ratio=0.01,
max_steps=1000000,
Expand All @@ -202,7 +204,7 @@ def pretrain_recipe(
MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=default_resume(),
)

@@ -248,15 +250,17 @@ def finetune_recipe(
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=1e-4,
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=50,
max_steps=2000,
min_lr=0.00001,
Expand All @@ -273,7 +277,7 @@ def finetune_recipe(
SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=nemo_resume(checkpoint_path),
)

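The recipe hunks in this file (and the identical ones in t5_3b.py below) replace eager object construction with nemo_run config wrappers, so a recipe stays a serializable, CLI-overridable description instead of a set of live objects. A minimal sketch of the difference, assuming nemo_run is importable as run:

# illustrative sketch, not part of this commit
import nemo_run as run
from megatron.core.optimizer import OptimizerConfig

# eager: the OptimizerConfig instance is built immediately, its fields fixed at definition time
eager = OptimizerConfig(optimizer='adam', lr=1e-4, use_distributed_optimizer=True, bf16=True)

# lazy: run.Config only records the target class and kwargs; the instance is built when the
# recipe is executed, and the kwargs can still be overridden before that
lazy = run.Config(OptimizerConfig, optimizer='adam', lr=1e-4, use_distributed_optimizer=True, bf16=True)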
18 changes: 11 additions & 7 deletions nemo/collections/llm/recipes/t5_3b.py
@@ -175,15 +175,17 @@ def pretrain_recipe(
guide in the `examples/llm/pretrain/` directory.
"""

opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=None,
warmup_ratio=0.01,
max_steps=1000000,
Expand All @@ -202,7 +204,7 @@ def pretrain_recipe(
MockDataModule, seq_length=512, seq_length_dec=128, global_batch_size=1920, micro_batch_size=24
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=default_resume(),
)

@@ -248,15 +250,17 @@ def finetune_recipe(
on fine-tuning LLMs with NeMo, see the fine-tuning guide in the
`examples/llm/finetune/` directory.
"""
opt_config = OptimizerConfig(
opt_config = run.Config(
OptimizerConfig,
optimizer='adam',
lr=1e-4,
lr=0.0001,
use_distributed_optimizer=True,
bf16=True,
weight_decay=0.01,
)

lr_scheduler = WarmupAnnealingScheduler(
lr_scheduler = run.Config(
WarmupAnnealingScheduler,
warmup_steps=50,
max_steps=2000,
min_lr=0.00001,
Expand All @@ -273,7 +277,7 @@ def finetune_recipe(
SquadDataModule, seq_length=512, seq_length_dec=128, global_batch_size=128, micro_batch_size=1
),
log=default_log(dir=dir, name=name, tensorboard_logger=tensorboard_logger(name=name)),
optim=MegatronOptimizerModule(config=opt_config, lr_scheduler=lr_scheduler),
optim=run.Config(MegatronOptimizerModule, config=opt_config, lr_scheduler=lr_scheduler),
resume=nemo_resume(checkpoint_path),
)

10 changes: 6 additions & 4 deletions nemo/collections/multimodal/data/energon/task_encoder.py
@@ -48,7 +48,8 @@ class MultiModalTaskEncoder(
and similarity interleaved samples.
This class extends the DefaultTaskEncoder and provides a flexible mechanism to handle and encode
different types of multimodal data. Support for VQA, captioning and interleaved samples is provided by default. It supports registering custom encoders for each sample type
different types of multimodal data. Support for VQA, captioning and interleaved samples is provided by default.
It supports registering custom encoders for each sample type
and provides methods for encoding individual samples, batching them, and further processing the batch
for model input.
"""
@@ -59,8 +60,8 @@ def __init__(self, tokenizer, image_processor, multimodal_sample_config):
Parameters:
tokenizer (Tokenizer): The tokenizer used for processing text across different sample types.
image_processor (ImageProcessor): The image processor used for preprocessing images across different sample types.
multimodal_sample_config (MultiModalSampleConfig): Configuration object for multimodal samples, including tokens and placeholders.
image_processor (ImageProcessor): The image processor used for preprocessing images.
multimodal_sample_config (MultiModalSampleConfig): MultiModalSampleConfig object.
"""
self.tokenizer = tokenizer
self.encoders: Dict[str, SampleEncoder] = {
@@ -173,5 +174,6 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict:
position_ids = torch.arange(seq_length, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1)
batch_dict['position_ids'] = position_ids
batch_dict['attention_mask'] = None
if 'attention_mask' not in batch_dict:
batch_dict['attention_mask'] = None
return batch_dict
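The encode_batch hunk above broadcasts a single arange of position ids across the micro batch and now only fills in attention_mask when no encoder has already provided one. A runnable sketch of that construction with hypothetical shapes:

# illustrative sketch, not part of this commit
import torch

micro_batch_size, seq_length = 2, 8
position_ids = torch.arange(seq_length, dtype=torch.long)
position_ids = position_ids.unsqueeze(0).repeat(micro_batch_size, 1)
# position_ids.shape == (2, 8); every row is [0, 1, ..., 7]

batch_dict = {'position_ids': position_ids}
if 'attention_mask' not in batch_dict:   # the new guard keeps any mask set upstream
    batch_dict['attention_mask'] = None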
@@ -573,15 +573,23 @@ def _build_samples_mapping(self):
self.samples_mapping = None

def _build_loss_mask(self, processed_example):
seq_boundaries = processed_example['seq_boundaries']
if self.answer_only_loss:
seq_boundaries = processed_example['seq_boundaries']
return np.concatenate(
[
processed_example['loss_mask'][seq_boundaries[i] + 1 : seq_boundaries[i + 1]]
for i in range(len(seq_boundaries) - 1)
]
)
return [1.0] * (len(processed_example['input_ids']) - len(processed_example['seq_boundaries']) + 1)
return np.concatenate(
[
[
0 if x == self.tokenizer.eos_id else 1.0
for x in processed_example['input_ids'][seq_boundaries[i] : seq_boundaries[i + 1] - 1]
]
for i in range(len(seq_boundaries) - 1)
]
)
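The new branch above (taken when answer_only_loss is False) builds the loss mask directly from input_ids, zeroing every eos_id position so that padding tokens inside a packed sample do not contribute to the loss. A small numeric sketch with hypothetical token ids:

# illustrative sketch, not part of this commit
import numpy as np

eos_id = 2                              # hypothetical tokenizer.eos_id
input_ids = [5, 6, 2, 7, 8, 2, 2, 2]    # two packed sequences; the second is padded with eos
seq_boundaries = [0, 3, 8]

loss_mask = np.concatenate(
    [
        [
            0 if x == eos_id else 1.0
            for x in input_ids[seq_boundaries[i] : seq_boundaries[i + 1] - 1]
        ]
        for i in range(len(seq_boundaries) - 1)
    ]
)
# loss_mask -> [1., 1., 1., 1., 0., 0.]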

def _maybe_cast_to_list(self, x):
return [item.tolist() if isinstance(item, np.ndarray) else item for item in x]
@@ -622,16 +630,40 @@ def collate_fn(self, batch):

position_ids: List[List[int]] = []
cu_seqlens: List[List[int]] = []
cu_seqlens_unpadded: List[List[int]] = []
for item in batch:
position_ids.append([])
cu_seqlens.append([0])
cu_seqlens_unpadded.append([0])
seqlens = np.array(item['seq_boundaries'][1:]) - np.array(item['seq_boundaries'][:-1])
for l in seqlens:
# length minus 1 because input_ids is truncated by 1 for labels
position_ids[-1].extend(list(range(l - 1)))
cu_seqlens[-1].append(cu_seqlens[-1][-1] + l - 1)
# set last seq to the max seq len because rope and attn kernels expect no padding
cu_seqlens[-1][-1] = max_length

# the last seq needs to be the max seq len because rope and attn kernels expect no padding
assert cu_seqlens[-1][-1] <= max_length

# since data is prepadded when cp_size > 1, there may be some extra padding at the end
# of the packed sequence. In this case, we need to add the max seq len to the end.
if cu_seqlens[-1][-1] != max_length:
cu_seqlens[-1].append(max_length)

for i in range(len(item['seq_boundaries']) - 1):
current_seq = item['input_ids'][item['seq_boundaries'][i] : item['seq_boundaries'][i + 1] - 1]

# since the data could be prepadded with the tokenizer's eos_id, find the indices of all eos_id tokens
eos_idx = np.where(np.array(current_seq) == self.tokenizer.eos_id)

# The first eos_id index marks the length of the original unpadded sequence if the sequence is
# prepadded for cp_size > 1. Otherwise, there is no extra padding.
seqlen_unpadded = eos_idx[0][0] + 1 if eos_idx[0].any() else len(current_seq)
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1] + seqlen_unpadded)

# if extra paddings are added in the packed sequence, they can't be counted as
# actual tokens for training
if len(cu_seqlens[-1]) > len(cu_seqlens_unpadded[-1]):
cu_seqlens_unpadded[-1].append(cu_seqlens_unpadded[-1][-1])

assert len(input_ids[0]) == len(
position_ids[0]
Expand All @@ -652,12 +684,16 @@ def collate_fn(self, batch):

if self.return_cu_seqlen:
cu_seqlens = self._collate_item(cu_seqlens, max_length=max(len(l) for l in cu_seqlens) + 1, pad_id=-1)

cu_seqlens_unpadded = self._collate_item(
cu_seqlens_unpadded, max_length=max(len(l) for l in cu_seqlens_unpadded) + 1, pad_id=-1
)
# Pre-generate `cu_seqlens_argmin` and `max_seqlen` as CPU tensor to avoid device-to-host copies.
cu_seqlens = torch.IntTensor(cu_seqlens)
cu_seqlens_argmin = torch.argmin(cu_seqlens, dim=1, keepdim=True)
seqlens = cu_seqlens[:, 1:] - cu_seqlens[:, :-1]
max_seqlen, _ = seqlens.max(dim=1, keepdim=True)
cu_seqlens_unpadded = torch.IntTensor(cu_seqlens_unpadded)
cu_seqlens_unpadded_argmin = torch.argmin(cu_seqlens_unpadded, dim=1, keepdim=True)

processed_batch.update(
{
Expand All @@ -667,6 +703,8 @@ def collate_fn(self, batch):
'cu_seqlens': torch.IntTensor(cu_seqlens), # cu_seqlens_q must be in dtype torch.int32
'cu_seqlens_argmin': cu_seqlens_argmin, # only required for perf
'max_seqlen': max_seqlen, # only required for perf
'cu_seqlens_unpadded': torch.IntTensor(cu_seqlens_unpadded),
'cu_seqlens_unpadded_argmin': cu_seqlens_unpadded_argmin,
}
)
else:
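The collate_fn changes above stop overwriting the last cu_seqlens entry and instead append max_length when the packed item carries trailing padding, while the new cu_seqlens_unpadded counts only real tokens so that padding is never treated as trainable. A numeric sketch for one hypothetical packed item:

# illustrative sketch, not part of this commit
max_length = 10
seqlens = [3, 4]                          # two packed sequences (hypothetical)

cu_seqlens = [0]
for l in seqlens:
    cu_seqlens.append(cu_seqlens[-1] + l - 1)        # input_ids truncated by 1 for labels -> [0, 2, 5]
if cu_seqlens[-1] != max_length:
    cu_seqlens.append(max_length)                    # trailing padding represented -> [0, 2, 5, 10]

cu_seqlens_unpadded = [0, 2, 5]                      # cumulative lengths of real tokens only
if len(cu_seqlens) > len(cu_seqlens_unpadded):
    cu_seqlens_unpadded.append(cu_seqlens_unpadded[-1])   # -> [0, 2, 5, 5]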

