Additional instruction for the "grad_scale is too small" error (#1550)
JinZr authored Mar 14, 2024
1 parent 15bd9a8 commit eb132da
Showing 49 changed files with 145 additions and 147 deletions.
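Every recipe touched by this commit swaps the inline raise RuntimeError(...) in train_one_epoch for a shared helper imported from icefall.err. The helper's definition is not part of the diff shown on this page; below is a minimal sketch of what such a function might contain. The signature, message wording, and suggested remedies are assumptions for illustration, not the actual icefall implementation.

def raise_grad_scale_is_too_small_error(cur_grad_scale: float) -> None:
    # Hypothetical sketch; the real icefall.err module is not shown in this commit view.
    raise RuntimeError(
        f"grad_scale is too small, exiting: {cur_grad_scale}\n"
        "NOTE: the gradient scale kept by torch.cuda.amp.GradScaler has collapsed,\n"
        "which usually means the loss has become inf or nan. Consider lowering the\n"
        "learning rate, reducing --max-duration, or training without --use-fp16."
    )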
@@ -89,6 +89,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.lexicon import Lexicon
from icefall.utils import (
AttributeDict,
@@ -881,9 +882,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)
if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
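For context, the check that triggers this error comes from mixed-precision training: torch.cuda.amp.GradScaler halves its loss scale every time it detects inf or nan gradients, so a scale below 1.0e-05 means nearly every recent step has overflowed and training has effectively diverged. The following is a simplified sketch of how the pattern above typically sits inside such a loop; the model, optimizer, and batch handling are placeholders, not the recipes' actual code.

import logging

import torch

from icefall.err import raise_grad_scale_is_too_small_error

def run_one_batch(model, optimizer, scaler, batch, use_fp16=True):
    # Simplified placeholder loop body, not the actual icefall train_one_epoch.
    with torch.cuda.amp.autocast(enabled=use_fp16):
        loss = model(batch)  # assumed to return a scalar loss
    scaler.scale(loss).backward()  # scale the loss so fp16 gradients do not underflow
    scaler.step(optimizer)         # unscales gradients, skips the step on inf/nan
    scaler.update()                # halves the scale after inf/nan, grows it otherwise
    optimizer.zero_grad()

    cur_grad_scale = scaler._scale.item() if use_fp16 else 1.0
    if cur_grad_scale < 0.01:
        logging.warning(f"Grad scale is small: {cur_grad_scale}")
    if cur_grad_scale < 1.0e-05:
        raise_grad_scale_is_too_small_error(cur_grad_scale)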
5 changes: 2 additions & 3 deletions egs/aishell/ASR/pruned_transducer_stateless7/train.py
@@ -85,6 +85,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import (
@@ -878,9 +879,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)
if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
cur_grad_scale = scaler._scale.item() if params.use_fp16 else 1.0
5 changes: 2 additions & 3 deletions egs/aishell/ASR/pruned_transducer_stateless7_bbpe/train.py
@@ -78,6 +78,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -871,9 +872,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -78,6 +78,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -882,9 +883,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -78,6 +78,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -881,9 +882,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/aishell/ASR/zipformer/train.py
@@ -86,6 +86,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import (
@@ -985,9 +986,7 @@ def save_bad_model(suffix: str = ""):
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
5 changes: 2 additions & 3 deletions egs/aishell/ASR/zipformer/train_bbpe.py
@@ -83,6 +83,7 @@
update_averaged_model,
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -570,9 +571,7 @@ def save_bad_model(suffix: str = ""):
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
5 changes: 2 additions & 3 deletions egs/alimeeting/ASR_v2/pruned_transducer_stateless7/train.py
@@ -70,6 +70,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.lexicon import Lexicon
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool
@@ -851,9 +852,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/ami/ASR/pruned_transducer_stateless7/train.py
@@ -69,6 +69,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@@ -842,9 +843,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/ami/SURT/dprnn_zipformer/train.py
@@ -75,6 +75,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@@ -1138,9 +1139,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/ami/SURT/dprnn_zipformer/train_adapt.py
@@ -75,6 +75,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@@ -1129,9 +1130,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/commonvoice/ASR/pruned_transducer_stateless7/train.py
@@ -79,6 +79,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -871,9 +872,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -889,9 +889,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise RuntimeError(f", exiting: {cur_grad_scale}")

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -81,6 +81,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -965,9 +966,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -78,6 +78,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@@ -888,9 +889,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
@@ -81,6 +81,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@@ -909,9 +910,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/csj/ASR/pruned_transducer_stateless7_streaming/train.py
@@ -81,6 +81,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

@@ -908,9 +909,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]
5 changes: 2 additions & 3 deletions egs/gigaspeech/ASR/zipformer/train.py
@@ -89,6 +89,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -1031,9 +1032,7 @@ def save_bad_model(suffix: str = ""):
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
5 changes: 2 additions & 3 deletions egs/gigaspeech/KWS/zipformer/finetune.py
@@ -100,6 +100,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -371,9 +372,7 @@ def save_bad_model(suffix: str = ""):
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
5 changes: 2 additions & 3 deletions egs/gigaspeech/KWS/zipformer/train.py
@@ -89,6 +89,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.hooks import register_inf_check_hooks
from icefall.utils import (
AttributeDict,
@@ -1034,9 +1035,7 @@ def save_bad_model(suffix: str = ""):
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
save_bad_model()
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = max(scheduler.get_last_lr())
5 changes: 2 additions & 3 deletions egs/libricss/SURT/dprnn_zipformer/train.py
@@ -85,6 +85,7 @@
)
from icefall.dist import cleanup_dist, setup_dist
from icefall.env import get_env_info
from icefall.err import raise_grad_scale_is_too_small_error
from icefall.utils import AttributeDict, MetricsTracker, setup_logger, str2bool

LRSchedulerType = Union[torch.optim.lr_scheduler._LRScheduler, optim.LRScheduler]
@@ -1169,9 +1170,7 @@ def train_one_epoch(
if cur_grad_scale < 0.01:
logging.warning(f"Grad scale is small: {cur_grad_scale}")
if cur_grad_scale < 1.0e-05:
raise RuntimeError(
f"grad_scale is too small, exiting: {cur_grad_scale}"
)
raise_grad_scale_is_too_small_error(cur_grad_scale)

if batch_idx % params.log_interval == 0:
cur_lr = scheduler.get_last_lr()[0]