
Commit

Merge branch 'main' into asr_with_tts_tutorial
artbataev authored Jul 6, 2023
2 parents 0894b32 + 1744718 commit 641c2f8
Showing 19 changed files with 31 additions and 49 deletions.
10 changes: 5 additions & 5 deletions Dockerfile
@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

-ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.04-py3
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:23.06-py3

# build an image that includes only the nemo dependencies, ensures that dependencies
# are included first for optimal caching, and useful for building a development
@@ -45,11 +45,11 @@ RUN apt-get update && \
WORKDIR /workspace/

WORKDIR /tmp/
-# TODO: Remove once this Apex commit (2/24/23) is included in PyTorch
+# TODO: Remove once this Apex commit (5/12/23) is included in PyTorch
# container
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
-git checkout 57057e2fcf1c084c0fcc818f55c0ff6ea1b24ae2 && \
+git checkout 8b7a1ff183741dd8f9b87e7bafd04cfde99cea28 && \
pip3 install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--fast_layer_norm" --global-option="--distributed_adam" --global-option="--deprecated_fused_adam" ./

# uninstall stuff from base container
@@ -75,7 +75,7 @@ RUN for f in $(ls requirements*.txt); do pip3 install --disable-pip-version-chec
# install flash attention dependencies
RUN pip install flash-attn
# pinned triton version for flash-attention https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attn_triton.py#L3
RUN pip install triton==2.0.0.dev20221202

# install k2, skip if installation fails
COPY scripts /tmp/nemo/scripts/
@@ -94,7 +94,7 @@ COPY . .

# start building the final container
FROM nemo-deps as nemo
-ARG NEMO_VERSION=1.19.0
+ARG NEMO_VERSION=1.20.0

# Check that NEMO_VERSION is set. Build will fail without this. Expose NEMO and base container
# version information as runtime environment variable for introspection purposes
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -1,7 +1,7 @@
pipeline {
agent {
docker {
-image 'nvcr.io/nvidia/pytorch:23.04-py3'
+image 'nvcr.io/nvidia/pytorch:23.06-py3'
args '--device=/dev/nvidia0 --gpus all --user 0:128 -v /home/TestData:/home/TestData -v $HOME/.cache:/root/.cache --shm-size=8g --env TRANSFORMERS_OFFLINE=1'
}
}
4 changes: 2 additions & 2 deletions README.rst
@@ -319,13 +319,13 @@ To build a nemo container with Dockerfile from a branch, please run
DOCKER_BUILDKIT=1 docker build -f Dockerfile -t nemo:latest .
-If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.04-py3 and then installing from GitHub.
+If you chose to work with main branch, we recommend using NVIDIA's PyTorch container version 23.06-py3 and then installing from GitHub.

.. code-block:: bash
docker run --gpus all -it --rm -v <nemo_github_folder>:/NeMo --shm-size=8g \
-p 8888:8888 -p 6006:6006 --ulimit memlock=-1 --ulimit \
-stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.04-py3
+stack=67108864 --device=/dev/snd nvcr.io/nvidia/pytorch:23.06-py3
Examples
--------
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/rnnt_wer.py
@@ -1224,7 +1224,7 @@ def validation_epoch_end(self, outputs):
def __init__(
self, decoding: RNNTDecoding, batch_dim_index=0, use_cer=False, log_prediction=True, dist_sync_on_step=False
):
-super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super(RNNTWER, self).__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.batch_dim_index = batch_dim_index
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/rnnt_wer_bpe.py
@@ -359,7 +359,7 @@ def __init__(
log_prediction: bool = True,
dist_sync_on_step=False,
):
-super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super(RNNTBPEWER, self).__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.batch_dim_index = batch_dim_index
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/wer.py
@@ -1125,7 +1125,7 @@ def __init__(
fold_consecutive=True,
dist_sync_on_step=False,
):
-super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super().__init__(dist_sync_on_step=dist_sync_on_step)

self.decoding = decoding
self.use_cer = use_cer
2 changes: 1 addition & 1 deletion nemo/collections/asr/metrics/wer_bpe.py
@@ -247,7 +247,7 @@ def __init__(
fold_consecutive=True,
dist_sync_on_step=False,
):
-super().__init__(dist_sync_on_step=dist_sync_on_step, compute_on_step=False)
+super().__init__(dist_sync_on_step=dist_sync_on_step)
self.decoding = decoding
self.tokenizer = self.decoding.tokenizer
self.blank_id = self.decoding.tokenizer.tokenizer.vocab_size
9 changes: 2 additions & 7 deletions nemo/collections/common/metrics/global_average_loss_metric.py
@@ -28,9 +28,6 @@ class GlobalAverageLossMetric(Metric):
See :doc:`PyTorch Lightning Metrics<pytorch-lightning:metrics>` for the metric usage instruction.
Args:
-compute_on_step:
-The method :meth:`forward` only calls ``update()`` and returns ``None`` if this is set to ``False``.
-default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each method :meth:`forward` call before returning the
value at the step
@@ -44,10 +41,8 @@ class GlobalAverageLossMetric(Metric):

full_state_update = True

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, take_avg_loss=True):
-super().__init__(
-compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group
-)
+def __init__(self, dist_sync_on_step=False, process_group=None, take_avg_loss=True):
+super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum')
self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx='sum')
self.take_avg_loss = take_avg_loss
8 changes: 2 additions & 6 deletions nemo/collections/common/metrics/perplexity.py
@@ -29,8 +29,6 @@ class Perplexity(Metric):
See `PyTorch Lightning Metrics <https://pytorch-lightning.readthedocs.io/en/stable/ecosystem/metrics.html>`_ for the metric usage instructions.
Args:
-compute_on_step:
-Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each ``forward()``
before returning the value at the step.
@@ -44,10 +42,8 @@ class Perplexity(Metric):

full_state_update = True

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, validate_args=True):
-super().__init__(
-compute_on_step=compute_on_step, dist_sync_on_step=dist_sync_on_step, process_group=process_group
-)
+def __init__(self, dist_sync_on_step=False, process_group=None, validate_args=True):
+super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
self.validate_args = validate_args
self.add_state('perplexities_sum', torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx='sum')
# Total number of distributions seen since last reset
9 changes: 2 additions & 7 deletions nemo/collections/nlp/metrics/sequence_perplexity.py
@@ -31,8 +31,6 @@ class SequencePerplexity(Metric):
See :doc:`PyTorch Lightning Metrics<pytorch-lightning:metrics>` for the metric usage instructions.
Args:
-compute_on_step:
-Forward only calls ``update()`` and returns ``None`` if this is set to ``False``. default: ``True``
dist_sync_on_step:
Synchronize metric state across processes at each ``forward()`` before returning the value at the step.
process_group:
@@ -43,12 +41,9 @@
to perform the allgather.
"""

-def __init__(self, compute_on_step=True, dist_sync_on_step=False, process_group=None, dist_sync_fn=None):
+def __init__(self, dist_sync_on_step=False, process_group=None, dist_sync_fn=None):
super().__init__(
-compute_on_step=compute_on_step,
-dist_sync_on_step=dist_sync_on_step,
-process_group=process_group,
-dist_sync_fn=dist_sync_fn,
+dist_sync_on_step=dist_sync_on_step, process_group=process_group, dist_sync_fn=dist_sync_fn,
)

# Total sum of exponentiated average negative log likelihoods
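
Across these metric files, the common change is dropping the compute_on_step argument, which recent torchmetrics releases removed from Metric.__init__. A minimal sketch of the resulting constructor pattern, assuming torchmetrics >= 0.11 (the AverageLoss class and its update/compute bodies below are illustrative, not NeMo code):

    import torch
    from torchmetrics import Metric

    class AverageLoss(Metric):
        full_state_update = True

        def __init__(self, dist_sync_on_step=False, process_group=None):
            # compute_on_step no longer exists, so it is not forwarded to the base class
            super().__init__(dist_sync_on_step=dist_sync_on_step, process_group=process_group)
            self.add_state("loss_sum", torch.tensor(0.0, dtype=torch.float64), dist_reduce_fx="sum")
            self.add_state("num_measurements", torch.tensor(0, dtype=torch.int64), dist_reduce_fx="sum")

        def update(self, loss: torch.Tensor):
            self.loss_sum += loss.detach().double()
            self.num_measurements += 1

        def compute(self):
            return self.loss_sum / self.num_measurements

With this API, forward() always computes and returns the batch value; code that previously passed compute_on_step=False to suppress that behavior calls update() directly instead.
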
@@ -116,7 +116,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
# create extra bias

# setup to track metrics
-self.validation_perplexity = Perplexity(compute_on_step=False)
+self.validation_perplexity = Perplexity()

self.setup_optimization(cfg.optim)

@@ -100,7 +100,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
decoder=cfg.language_model.pretrained_decoder_model_name,
)

-self.validation_perplexity = Perplexity(compute_on_step=False)
+self.validation_perplexity = Perplexity()

self.setup_optimization(cfg.optim)

12 changes: 6 additions & 6 deletions nemo/core/optim/distributed_adam.py
@@ -19,6 +19,7 @@
from apex.contrib.optimizers.distributed_fused_adam import (
DistributedFusedAdam,
_coalescing_manager,
+_coalescing_manager_append_work,
_disable_pre_forward_hook,
)
from megatron.core import parallel_state
@@ -173,16 +174,15 @@ def _fp32_optim_grad_sync(self):
for model_param, main_param in self._fp32_optim_main_params.items():
if model_param.grad is not None:
main_param.grad += model_param.grad.detach()
-sync_requests = []
-with _coalescing_manager(self.process_group, self.device, sync_requests):
+with _coalescing_manager(self.process_group, self.device, async_ops=True) as cm:
for main_param in self._fp32_optim_main_params.values():
-sync_requests.append(
+_coalescing_manager_append_work(
+cm,
torch.distributed.all_reduce(
main_param.grad, op=torch.distributed.ReduceOp.AVG, group=self.process_group, async_op=True,
-)
+),
)
-for req in sync_requests:
-req.wait()
+cm.wait()
self._fp32_optim_grad_sync_needed = False

def zero_grad(self, *args, **kwargs):
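
The distributed_adam.py update above switches to the newer Apex coalescing-manager API: all-reduces are queued inside a context manager and the coalesced work is waited on once, instead of tracking a list of requests. A sketch of that pattern in isolation, based on the updated lines above (sync_fp32_grads and its arguments are illustrative names, and exact Apex signatures may vary between versions):

    import torch
    from apex.contrib.optimizers.distributed_fused_adam import (
        _coalescing_manager,
        _coalescing_manager_append_work,
    )

    def sync_fp32_grads(grads, process_group, device):
        # Queue one async all-reduce per gradient inside a single coalescing context,
        # then wait once for the coalesced work rather than per request.
        with _coalescing_manager(process_group, device, async_ops=True) as cm:
            for grad in grads:
                _coalescing_manager_append_work(
                    cm,
                    torch.distributed.all_reduce(
                        grad, op=torch.distributed.ReduceOp.AVG, group=process_group, async_op=True
                    ),
                )
        cm.wait()
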
1 change: 0 additions & 1 deletion nemo/core/optim/optimizers.py
@@ -51,7 +51,6 @@
AVAILABLE_OPTIMIZERS['fused_adam'] = FusedAdam
except ModuleNotFoundError:
HAVE_APEX = False
-logging.warning("Apex was not found. Using the lamb or fused_adam optimizer will error out.")

HAVE_APEX_DISTRIBUTED_ADAM = False
if HAVE_APEX:
1 change: 0 additions & 1 deletion requirements/requirements_asr.txt
@@ -1,7 +1,6 @@
braceexpand
editdistance
g2p_en
-inflect
ipywidgets
jiwer
kaldi-python-io
2 changes: 2 additions & 0 deletions requirements/requirements_common.txt
@@ -1,4 +1,6 @@
+inflect
pandas
+pydantic<2 # remove after inflect supports Pydantic 2.0+
sacremoses>=0.0.43
sentencepiece<1.0.0
youtokentome>=1.0.5
1 change: 0 additions & 1 deletion requirements/requirements_nlp.txt
@@ -7,7 +7,6 @@ ftfy
gdown
h5py
ijson
-inflect
jieba
markdown2
matplotlib>=3.3.2
1 change: 0 additions & 1 deletion requirements/requirements_tts.txt
@@ -1,6 +1,5 @@
attrdict
einops
-inflect
jieba
kornia
librosa
8 changes: 3 additions & 5 deletions tests/collections/common/pl_utils.py
@@ -90,7 +90,7 @@ def _class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instanciate lightning metric
-metric = metric_class(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
+metric = metric_class(dist_sync_on_step=dist_sync_on_step, **metric_args)

# verify metrics work after being loaded from pickled state
pickled_metric = pickle.dumps(metric)
@@ -303,7 +303,7 @@ def _perplexity_class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instanciate lightning metric
-perplexity = Perplexity(compute_on_step=True, dist_sync_on_step=dist_sync_on_step, **metric_args)
+perplexity = Perplexity(dist_sync_on_step=dist_sync_on_step, **metric_args)
if (probs is None) == (logits is None):
with pytest.raises(ValueError):
perplexity(probs, logits)
@@ -464,9 +464,7 @@ def _loss_class_test(
calculated across devices for each batch (and not just at the end)
"""
# Instantiate lightning metric
-loss_metric = GlobalAverageLossMetric(
-compute_on_step=True, dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss
-)
+loss_metric = GlobalAverageLossMetric(dist_sync_on_step=dist_sync_on_step, take_avg_loss=take_avg_loss)

# verify loss works after being loaded from pickled state
pickled_metric = pickle.dumps(loss_metric)