2024-04-12 21:49:57,598 INFO [pretrain.py:1104] (3/4) Training started
2024-04-12 21:49:57,599 INFO [pretrain.py:1114] (3/4) Device: cuda:3
2024-04-12 21:49:57,599 INFO [pretrain.py:1115] (3/4) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'sub_batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'env_info': {'k2-version': '1.24.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '2ad3362f13f7e5a01764ba98fa7cca6fb4cf6afc', 'k2-git-date': 'Sat Oct 21 01:47:40 2023', 'lhotse-version': '1.17.0.dev+git.a470186.clean', 'torch-version': '2.1.0', 'torch-cuda-available': True, 'torch-cuda-version': '12.1', 'python-version': '3.1', 'icefall-git-branch': 'k2ssl', 'icefall-git-sha1': '87843e9-dirty', 'icefall-git-date': 'Fri Apr 5 00:29:16 2024', 'icefall-path': '/mnt/host/icefall', 'k2-path': '/opt/conda/lib/python3.10/site-packages/k2/__init__.py', 'lhotse-path': '/opt/conda/lib/python3.10/site-packages/lhotse/__init__.py', 'hostname': 'd938d3903e1d', 'IP address': '172.17.0.2'}, 'world_size': 4, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 34, 'start_epoch': 33, 'start_batch': 0, 'exp_dir': PosixPath('zipformer/exp2/pretrain'), 'base_lr': 0.045, 'lr_batches': 7500, 'lr_epochs': 10.5, 'warmup_batches': 5000, 'warmup_start': 0, 'ref_duration': 600, 'seed': 42, 'print_diagnostics': False, 'sanity_check': False, 'inf_check': True, 'save_every_n': 100000, 'keep_last_k': 30, 'average_period': 200, 'accum_grad': 2, 'use_fp16': True, 'max_keep_size': 9223372036854775807, 'min_keep_size': 32000, 'max_sample_size': 250000, 'num_encoder_layers': '2,2,3,4,3,2', 'downsampling_factor': '1,2,4,8,4,2', 'feedforward_dim': '512,768,1024,1536,1024,768', 'num_heads': '4,4,4,8,4,4', 'encoder_dim': '192,256,384,512,384,256', 'query_head_dim': '32', 'value_head_dim': '12', 'pos_head_dim': '4', 'pos_dim': 48, 'encoder_unmasked_dim': '192,192,256,256,256,192', 'cnn_module_kernel': '31,31,15,15,15,31', 'label_rate': 50, 'sample_rate': 16000, 'extractor_mode': 'default', 'conv_feature_layers': '[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2', 'conv_bias': False, 'feature_grad_mult': 1.0, 'mask_length': 10, 'mask_prob': 0.65, 'mask_selection': 'static', 'mask_other': 0, 'no_mask_overlap': False, 'mask_min_space': 1, 'mask_channel_length': 10, 'mask_channel_prob': 0.0, 'mask_channel_selection': 'static', 'mask_channel_other': 0, 'no_mask_channel_overlap': False, 'mask_channel_min_space': 1, 'skip_masked': False, 'skip_nomask': False, 'checkpoint_activations': False, 'pred_masked_weight': 1, 'pred_nomask_weight': 0, 'loss_weights': [10], 'required_seq_len_multiple': 2, 'attn_type': '', 'pos_enc_type': 'abs', 'logit_temp': 0.1, 'dropout_input': 0.0, 'dropout_features': 0.0, 'num_classes': [504], 'untie_final_proj': False, 'full_libri': True, 'manifest_dir': PosixPath('data/raw'), 'max_duration': 350.0, 'bucketing_sampler': True, 'num_buckets': 30, 'shuffle': True, 'drop_last': True, 'num_workers': 2, 'do_normalize': True, 'random_crop': True}
2024-04-12 21:49:57,599 INFO [pretrain.py:1117] (3/4) About to create model
2024-04-12 21:49:57,619 INFO [pretrain.py:1104] (2/4) Training started
2024-04-12 21:49:57,619 INFO [pretrain.py:1114] (2/4) Device: cuda:2
2024-04-12 21:49:57,619 INFO [pretrain.py:1115] (2/4) { ... same parameters as logged by (3/4) above ... }
2024-04-12 21:49:57,620 INFO [pretrain.py:1117] (2/4) About to create model
2024-04-12 21:49:58,403 INFO [pretrain.py:1104] (0/4) Training started
2024-04-12 21:49:58,406 INFO [pretrain.py:1114] (0/4) Device: cuda:0
2024-04-12 21:49:58,406 INFO [pretrain.py:1115] (0/4) { ... same parameters as logged by (3/4) above ... }
2024-04-12 21:49:58,407 INFO [pretrain.py:1117] (0/4) About to create model
2024-04-12 21:49:58,444 INFO [pretrain.py:1121] (3/4) Number of model parameters: 67940861
2024-04-12 21:49:58,445 INFO [checkpoint.py:112] (3/4) Loading checkpoint from zipformer/exp2/pretrain/epoch-32.pt
2024-04-12 21:49:58,551 INFO [pretrain.py:1121] (2/4) Number of model parameters: 67940861
2024-04-12 21:49:58,551 INFO [checkpoint.py:112] (2/4) Loading checkpoint from zipformer/exp2/pretrain/epoch-32.pt
2024-04-12 21:49:58,590 INFO [pretrain.py:1104] (1/4) Training started
2024-04-12 21:49:58,590 INFO [pretrain.py:1114] (1/4) Device: cuda:1
2024-04-12 21:49:58,590 INFO [pretrain.py:1115] (1/4) { ... same parameters as logged by (3/4) above ... }
2024-04-12 21:49:58,590 INFO [pretrain.py:1117] (1/4) About to create model
2024-04-12 21:49:59,296 INFO [pretrain.py:1121] (0/4) Number of model parameters: 67940861
2024-04-12 21:49:59,406 INFO [pretrain.py:1136] (3/4) Using DDP
2024-04-12 21:49:59,466 INFO [pretrain.py:1121] (1/4) Number of model parameters: 67940861
2024-04-12 21:49:59,466 INFO [checkpoint.py:112] (1/4) Loading checkpoint from zipformer/exp2/pretrain/epoch-32.pt
2024-04-12 21:49:59,490 INFO [pretrain.py:1136] (2/4) Using DDP
2024-04-12 21:49:59,946 INFO [checkpoint.py:112] (0/4) Loading checkpoint from zipformer/exp2/pretrain/epoch-32.pt
2024-04-12 21:50:00,426 INFO [pretrain.py:1136] (1/4) Using DDP
2024-04-12 21:50:00,768 INFO [checkpoint.py:131] (0/4) Loading averaged model
2024-04-12 21:50:01,146 INFO [pretrain.py:1136] (0/4) Using DDP
2024-04-12 21:50:03,126 INFO [pretrain.py:1154] (2/4) Loading optimizer state dict
2024-04-12 21:50:03,155 INFO [pretrain.py:1154] (1/4) Loading optimizer state dict
2024-04-12 21:50:03,158 INFO [pretrain.py:1154] (3/4) Loading optimizer state dict
2024-04-12 21:50:03,232 INFO [pretrain.py:1154] (0/4) Loading optimizer state dict
2024-04-12 21:50:03,240 INFO [pretrain.py:1162] (2/4) Loading scheduler state dict
2024-04-12 21:50:03,276 INFO [pretrain.py:1162] (1/4) Loading scheduler state dict
2024-04-12 21:50:03,289 INFO [ssl_datamodule.py:297] (2/4) About to get the shuffled train-clean-100, train-clean-360 and train-other-500 cuts
2024-04-12 21:50:03,289 INFO [ssl_datamodule.py:276] (2/4) About to get train-clean-100 cuts
2024-04-12 21:50:03,290 INFO [ssl_datamodule.py:283] (2/4) About to get train-clean-360 cuts
2024-04-12 21:50:03,291 INFO [pretrain.py:1162] (3/4) Loading scheduler state dict
2024-04-12 21:50:03,292 INFO [ssl_datamodule.py:290] (2/4) About to get train-other-500 cuts
2024-04-12 21:50:03,293 INFO [ssl_datamodule.py:155] (2/4) About to create train dataset
2024-04-12 21:50:03,293 INFO [ssl_datamodule.py:167] (2/4) Using DynamicBucketingSampler.
2024-04-12 21:50:03,327 INFO [ssl_datamodule.py:297] (1/4) About to get the shuffled train-clean-100, train-clean-360 and train-other-500 cuts
2024-04-12 21:50:03,327 INFO [ssl_datamodule.py:276] (1/4) About to get train-clean-100 cuts
2024-04-12 21:50:03,332 INFO [ssl_datamodule.py:283] (1/4) About to get train-clean-360 cuts
2024-04-12 21:50:03,338 INFO [ssl_datamodule.py:297] (3/4) About to get the shuffled train-clean-100, train-clean-360 and train-other-500 cuts
2024-04-12 21:50:03,338 INFO [ssl_datamodule.py:276] (3/4) About to get train-clean-100 cuts
2024-04-12 21:50:03,344 INFO [ssl_datamodule.py:290] (1/4) About to get train-other-500 cuts
2024-04-12 21:50:03,344 INFO [ssl_datamodule.py:283] (3/4) About to get train-clean-360 cuts
2024-04-12 21:50:03,345 INFO [ssl_datamodule.py:290] (3/4) About to get train-other-500 cuts
2024-04-12 21:50:03,345 INFO [ssl_datamodule.py:155] (1/4) About to create train dataset
2024-04-12 21:50:03,345 INFO [ssl_datamodule.py:167] (1/4) Using DynamicBucketingSampler.
2024-04-12 21:50:03,346 INFO [ssl_datamodule.py:155] (3/4) About to create train dataset
2024-04-12 21:50:03,346 INFO [ssl_datamodule.py:167] (3/4) Using DynamicBucketingSampler.
2024-04-12 21:50:03,350 INFO [pretrain.py:1162] (0/4) Loading scheduler state dict
2024-04-12 21:50:03,395 INFO [ssl_datamodule.py:297] (0/4) About to get the shuffled train-clean-100, train-clean-360 and train-other-500 cuts
2024-04-12 21:50:03,395 INFO [ssl_datamodule.py:276] (0/4) About to get train-clean-100 cuts
2024-04-12 21:50:03,396 INFO [ssl_datamodule.py:283] (0/4) About to get train-clean-360 cuts
2024-04-12 21:50:03,397 INFO [ssl_datamodule.py:290] (0/4) About to get train-other-500 cuts
2024-04-12 21:50:03,398 INFO [ssl_datamodule.py:155] (0/4) About to create train dataset
2024-04-12 21:50:03,398 INFO [ssl_datamodule.py:167] (0/4) Using DynamicBucketingSampler.
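Each rank reports "Using DynamicBucketingSampler." Given the options in the parameter dict above ('max_duration': 350.0, 'num_buckets': 30, 'shuffle': True, 'drop_last': True), a lhotse sampler of this kind is constructed roughly as below. This is a sketch against lhotse's public API, not the exact ssl_datamodule.py code, and the manifest filename is illustrative:

    from lhotse import CutSet
    from lhotse.dataset import DynamicBucketingSampler

    # Illustrative manifest name; the run above reads manifests from data/raw.
    cuts = CutSet.from_file("data/raw/librispeech_cuts_train-all-shuf.jsonl.gz")

    sampler = DynamicBucketingSampler(
        cuts,
        max_duration=350.0,  # total seconds of audio per mini-batch
        num_buckets=30,      # bucket cuts by duration to reduce padding waste
        shuffle=True,
        drop_last=True,
    )

    for batch_cuts in sampler:
        pass  # each item is a CutSet holding one mini-batch of cuts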
2024-04-12 21:50:07,668 INFO [ssl_datamodule.py:182] (0/4) About to create train dataloader
2024-04-12 21:50:07,668 INFO [ssl_datamodule.py:317] (0/4) About to get dev-clean cuts
2024-04-12 21:50:07,680 INFO [ssl_datamodule.py:215] (0/4) About to create dev dataset
2024-04-12 21:50:07,766 INFO [ssl_datamodule.py:182] (2/4) About to create train dataloader
2024-04-12 21:50:07,767 INFO [ssl_datamodule.py:317] (2/4) About to get dev-clean cuts
2024-04-12 21:50:07,768 INFO [ssl_datamodule.py:215] (2/4) About to create dev dataset
2024-04-12 21:50:07,795 INFO [ssl_datamodule.py:182] (3/4) About to create train dataloader
2024-04-12 21:50:07,795 INFO [ssl_datamodule.py:317] (3/4) About to get dev-clean cuts
2024-04-12 21:50:07,796 INFO [ssl_datamodule.py:215] (3/4) About to create dev dataset
2024-04-12 21:50:07,816 INFO [ssl_datamodule.py:182] (1/4) About to create train dataloader
2024-04-12 21:50:07,816 INFO [ssl_datamodule.py:317] (1/4) About to get dev-clean cuts
2024-04-12 21:50:07,819 INFO [ssl_datamodule.py:215] (1/4) About to create dev dataset
2024-04-12 21:50:07,943 INFO [ssl_datamodule.py:230] (0/4) About to create dev dataloader
2024-04-12 21:50:07,944 INFO [pretrain.py:1248] (0/4) Loading grad scaler state dict
2024-04-12 21:50:08,030 INFO [ssl_datamodule.py:230] (2/4) About to create dev dataloader
2024-04-12 21:50:08,031 INFO [pretrain.py:1248] (2/4) Loading grad scaler state dict
2024-04-12 21:50:08,058 INFO [ssl_datamodule.py:230] (3/4) About to create dev dataloader
2024-04-12 21:50:08,058 INFO [pretrain.py:1248] (3/4) Loading grad scaler state dict
2024-04-12 21:50:08,080 INFO [ssl_datamodule.py:230] (1/4) About to create dev dataloader
2024-04-12 21:50:08,080 INFO [pretrain.py:1248] (1/4) Loading grad scaler state dict
/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior.
  warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes "
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:50:17,417 INFO [checkpoint.py:75] (1/4) Saving checkpoint to zipformer/exp2/pretrain/bad-model-first-warning-1.pt 2024-04-12 21:50:17,417 INFO [checkpoint.py:75] (3/4) Saving checkpoint to zipformer/exp2/pretrain/bad-model-first-warning-3.pt 2024-04-12 21:50:17,424 INFO [checkpoint.py:75] (2/4) Saving checkpoint to zipformer/exp2/pretrain/bad-model-first-warning-2.pt 2024-04-12 21:50:17,774 INFO [checkpoint.py:75] (0/4) Saving checkpoint to zipformer/exp2/pretrain/bad-model-first-warning-0.pt 2024-04-12 21:50:53,586 WARNING [pretrain.py:1024] (2/4) Grad scale is small: 0.0001220703125 2024-04-12 21:50:53,587 INFO [pretrain.py:1035] (2/4) Epoch 33, batch 0, loss[loss=3.135, loss_m_0=3.134, loss_u_0=4.121, loss_features_pen=0.0008038, correct_m_0=0.251, count_m_0=1, correct_u_0=0.3005, count_u_0=1.225, over 7371.00 frames. ], tot_loss[loss=2.983, loss_m_0=2.982, loss_u_0=3.96, loss_features_pen=0.0008277, correct_m_0=0.2836, count_m_0=1, correct_u_0=0.3359, count_u_0=1.249, over 14517.09 frames. ], batch size: 21, lr: 1.06e-02, grad_scale: 0.000244140625 2024-04-12 21:50:53,587 INFO [pretrain.py:1058] (2/4) Computing validation loss 2024-04-12 21:50:53,608 WARNING [pretrain.py:1024] (1/4) Grad scale is small: 0.0001220703125 2024-04-12 21:50:53,608 INFO [pretrain.py:1035] (1/4) Epoch 33, batch 0, loss[loss=2.402, loss_m_0=2.401, loss_u_0=3.029, loss_features_pen=0.0007822, correct_m_0=0.3714, count_m_0=1, correct_u_0=0.4361, count_u_0=1.199, over 7634.00 frames. ], tot_loss[loss=3.012, loss_m_0=3.011, loss_u_0=3.869, loss_features_pen=0.0007972, correct_m_0=0.2866, count_m_0=1, correct_u_0=0.3312, count_u_0=1.216, over 13537.33 frames. 
], batch size: 22, lr: 1.06e-02, grad_scale: 0.000244140625 2024-04-12 21:50:53,609 INFO [pretrain.py:1058] (1/4) Computing validation loss 2024-04-12 21:50:53,850 WARNING [pretrain.py:1024] (3/4) Grad scale is small: 0.0001220703125 2024-04-12 21:50:53,851 INFO [pretrain.py:1035] (3/4) Epoch 33, batch 0, loss[loss=2.84, loss_m_0=2.84, loss_u_0=3.848, loss_features_pen=0.0007513, correct_m_0=0.3109, count_m_0=1, correct_u_0=0.3857, count_u_0=1.317, over 7317.00 frames. ], tot_loss[loss=2.881, loss_m_0=2.88, loss_u_0=3.958, loss_features_pen=0.0007751, correct_m_0=0.2857, count_m_0=1, correct_u_0=0.3722, count_u_0=1.326, over 14152.65 frames. ], batch size: 27, lr: 1.06e-02, grad_scale: 0.000244140625 2024-04-12 21:50:53,851 INFO [pretrain.py:1058] (3/4) Computing validation loss 2024-04-12 21:50:55,326 WARNING [pretrain.py:1024] (0/4) Grad scale is small: 0.0001220703125 2024-04-12 21:50:55,327 INFO [pretrain.py:1035] (0/4) Epoch 33, batch 0, loss[loss=2.57, loss_m_0=2.569, loss_u_0=3.685, loss_features_pen=0.0008161, correct_m_0=0.3462, count_m_0=1, correct_u_0=0.4787, count_u_0=1.421, over 6825.00 frames. ], tot_loss[loss=3.035, loss_m_0=3.034, loss_u_0=4.179, loss_features_pen=0.0008255, correct_m_0=0.283, count_m_0=1, correct_u_0=0.3821, count_u_0=1.36, over 13193.99 frames. ], batch size: 25, lr: 1.06e-02, grad_scale: 0.000244140625 2024-04-12 21:50:55,333 INFO [pretrain.py:1058] (0/4) Computing validation loss 2024-04-12 21:51:07,109 INFO [pretrain.py:1066] (2/4) Epoch 33, validation: loss=4.377, loss_m_0=4.376, loss_u_0=5.895, loss_features_pen=0.0008089, correct_m_0=0.1707, count_m_0=1, correct_u_0=0.2369, count_u_0=1.389, over 349395.00 frames. 2024-04-12 21:51:07,109 INFO [pretrain.py:1067] (2/4) Maximum memory allocated so far is 11446MB 2024-04-12 21:51:07,110 INFO [pretrain.py:1066] (3/4) Epoch 33, validation: loss=4.377, loss_m_0=4.376, loss_u_0=5.895, loss_features_pen=0.0008089, correct_m_0=0.1707, count_m_0=1, correct_u_0=0.2369, count_u_0=1.389, over 349395.00 frames. 2024-04-12 21:51:07,110 INFO [pretrain.py:1067] (3/4) Maximum memory allocated so far is 11684MB 2024-04-12 21:51:07,114 INFO [pretrain.py:1066] (1/4) Epoch 33, validation: loss=4.377, loss_m_0=4.376, loss_u_0=5.895, loss_features_pen=0.0008089, correct_m_0=0.1707, count_m_0=1, correct_u_0=0.2369, count_u_0=1.389, over 349395.00 frames. 2024-04-12 21:51:07,114 INFO [pretrain.py:1066] (0/4) Epoch 33, validation: loss=4.377, loss_m_0=4.376, loss_u_0=5.895, loss_features_pen=0.0008089, correct_m_0=0.1707, count_m_0=1, correct_u_0=0.2369, count_u_0=1.389, over 349395.00 frames. 2024-04-12 21:51:07,114 INFO [pretrain.py:1067] (1/4) Maximum memory allocated so far is 11635MB 2024-04-12 21:51:07,115 INFO [pretrain.py:1067] (0/4) Maximum memory allocated so far is 11446MB /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. 
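The "Grad scale is small: 0.0001220703125" warnings are a consequence of fp16 training ('use_fp16': True): torch.cuda.amp's GradScaler halves its loss scale whenever scaled gradients contain inf/NaN, so a scale this far below 1.0 means many recent steps overflowed and were skipped. A generic sketch of the standard AMP pattern (not the actual pretrain.py code; the 1e-3 threshold is illustrative):

    import torch
    from torch.cuda.amp import GradScaler, autocast

    model = torch.nn.Linear(16, 16).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.045)
    scaler = GradScaler()  # scale grows on stable steps, halves on inf/NaN

    for _ in range(10):
        optimizer.zero_grad(set_to_none=True)
        with autocast():  # run the forward in float16 where it is safe
            loss = model(torch.randn(8, 16, device="cuda")).pow(2).mean()
        scaler.scale(loss).backward()  # backward through the scaled loss
        scaler.step(optimizer)         # skipped if gradients were inf/NaN
        scaler.update()                # adapt the scale for the next step
        if scaler.get_scale() < 1e-3:  # the kind of check behind the warning
            print("Grad scale is small:", scaler.get_scale())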
2024-04-12 21:51:14,334 INFO [scaling.py:214] (0/4) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=194128.66666666666, ans=0.0
2024-04-12 21:51:14,550 INFO [scaling.py:214] (2/4) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=194128.66666666666, ans=0.2
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:16,588 INFO [scaling.py:1023] (0/4) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=15.32 vs. 
limit=15.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " 2024-04-12 21:51:17,702 INFO [scaling.py:214] (0/4) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=194128.66666666666, ans=0.125 2024-04-12 21:51:17,757 INFO [scaling.py:214] (3/4) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=194128.66666666666, ans=0.125 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:19,603 INFO [scaling.py:1023] (0/4) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.96 vs. limit=6.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:31,691 INFO [scaling.py:1023] (0/4) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=17.02 vs. limit=15.0 2024-04-12 21:51:31,722 INFO [scaling.py:1023] (2/4) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=17.41 vs. limit=15.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " 2024-04-12 21:51:34,382 INFO [scaling.py:214] (2/4) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=194128.66666666666, ans=0.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. 
This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. 
This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:36,319 INFO [scaling.py:214] (1/4) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten.whitening_limit, batch_count=194175.33333333334, ans=15.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
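The ScheduledFloat entries above log hyper-parameters that icefall's scaling.py anneals as a function of batch_count. A rough sketch of the idea, assuming a simple piecewise-linear schedule over (batch_count, value) breakpoints (the schedule values here are made up for illustration; this is not the scaling.py implementation):

    def scheduled_float(batch_count: float, schedule: list[tuple[float, float]]) -> float:
        """Piecewise-linear interpolation of a hyper-parameter over batch_count."""
        x0, y0 = schedule[0]
        if batch_count <= x0:
            return y0
        for x1, y1 in schedule[1:]:
            if batch_count <= x1:
                return y0 + (y1 - y0) * (batch_count - x0) / (x1 - x0)
            x0, y0 = x1, y1
        return y0

    # e.g. a skip rate decaying from 0.3 to 0.0 over the first 20k batches has
    # long since reached 0.0 by batch_count=194128.67, as in the ans=0.0 above.
    print(scheduled_float(194128.66666666666, [(0.0, 0.3), (20000.0, 0.0)]))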
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " 2024-04-12 21:51:37,132 INFO [scaling.py:214] (3/4) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=194175.33333333334, ans=0.1 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:37,759 INFO [scaling.py:1023] (2/4) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=19.11 vs. limit=15.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:42,720 INFO [scaling.py:1023] (2/4) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=16.76 vs. limit=15.0 /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1359: UserWarning: Using a non-full backward hook when the forward contains multiple autograd Nodes is deprecated and will be removed in future versions. This hook will be missing some grad_input. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using a non-full backward hook when the forward contains multiple autograd Nodes " 2024-04-12 21:51:43,572 INFO [checkpoint.py:75] (2/4) Saving checkpoint to zipformer/exp2/pretrain/bad-model-2.pt /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. warnings.warn("Using non-full backward hooks on a Module that does not return a " /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1324: UserWarning: Using non-full backward hooks on a Module that does not return a single Tensor or a tuple of Tensors is deprecated and will be removed in future versions. This hook will be missing some of the grad_output. Please use register_full_backward_hook to get the documented behavior. 
warnings.warn("Using non-full backward hooks on a Module that does not return a " 2024-04-12 21:51:52,462 INFO [pretrain.py:1315] (2/4) Saving batch to zipformer/exp2/pretrain/batch-de04cece-83e9-60e4-9c51-e692dc1729ca.pt 2024-04-12 21:51:52,777 INFO [pretrain.py:1319] (2/4) audio shape: torch.Size([22, 242480]) Traceback (most recent call last): File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/pretrain.py", line 1380, in main() File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/pretrain.py", line 1371, in main mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True) File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 246, in spawn return start_processes(fn, args, nprocs, join, daemon, start_method="spawn") File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 202, in start_processes while not context.join(): File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 163, in join raise ProcessRaisedException(msg, error_index, failed_process.pid) torch.multiprocessing.spawn.ProcessRaisedException: -- Process 2 terminated with the following error: Traceback (most recent call last): File "/opt/conda/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 74, in _wrap fn(i, *args) File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/pretrain.py", line 1261, in run train_one_epoch( File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/pretrain.py", line 948, in train_one_epoch loss, loss_info = compute_loss( File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/pretrain.py", line 827, in compute_loss loss, num_masked_tokens, logging_output = model( File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl result = forward_call(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1519, in forward else self._run_ddp_forward(*inputs, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1355, in _run_ddp_forward return self.module(*inputs, **kwargs) # type: ignore[index] File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/hubert_ce.py", line 401, in forward features = self.forward_features(source) File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/hubert_ce.py", line 356, in forward_features features = self.feature_extractor(source) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl result = forward_call(*args, **kwargs) File "/mnt/host/icefall-k2ssl/egs/librispeech/SSL/zipformer/wav2vec2_module.py", line 106, in forward x = conv(x) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1568, in _call_impl result 
= forward_call(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/container.py", line 215, in forward input = module(input) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl return self._call_impl(*args, **kwargs) File "/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1581, in _call_impl hook_result = hook(self, args, result) File "/workspace/icefall/icefall/hooks.py", line 41, in forward_hook raise ValueError( ValueError: The sum of module.feature_extractor.conv_layers.2.0.output is not finite: tensor([[[ -6.5234, -6.5078, -6.6094, ..., -6.5820, -6.5469, -6.5469], [ -0.7900, -0.6479, -0.5444, ..., -0.9287, -0.9971, -0.9380], [ -7.3672, -8.1250, -8.5938, ..., -7.8672, -7.9023, -7.8047], ..., [ -6.7461, -6.7930, -6.7305, ..., -6.7852, -6.7773, -6.7500], [ -8.1016, -8.3359, -8.0469, ..., -8.0000, -7.9922, -8.0625], [ -0.6177, -0.8335, -1.2656, ..., -1.1338, -1.4365, -0.8394]], [[ -25.1094, -21.8438, -18.2188, ..., -17.1562, -21.2188, -18.9844], [ -19.5312, -32.8438, -24.5938, ..., -19.6250, -23.6094, -26.1875], [ -48.7812, -36.7500, -33.6875, ..., -35.3750, -34.1250, -35.0312], ..., [ -25.5625, -20.8906, -21.9531, ..., -18.2188, -22.1406, -19.6250], [ -27.6406, -31.2969, -18.5156, ..., -26.4062, -28.8594, -26.9531], [ -35.9375, -22.6875, -12.6406, ..., -19.3125, -5.4219, -1.5449]], [[ -43.4062, -45.7812, -43.2188, ..., -39.5000, -40.6562, -43.9688], [-101.8125, -67.1875, -111.1875, ..., -81.5000, -66.1250, -98.4375], [-102.6250, -137.8750, -97.6250, ..., -94.0625, -115.0625, -134.8750], ..., [ -51.2812, -51.9688, -48.9062, ..., -49.3438, -61.2500, -54.4062], [ -70.8125, -69.8750, -66.2500, ..., -72.7500, -62.0625, -67.9375], [ -68.2500, -61.9688, -68.4375, ..., -35.6250, -74.7500, -105.8125]], ..., [[ -9.2344, -8.1250, -8.6094, ..., -10.2656, -10.9688, -12.4297], [ -6.8398, -8.8047, -2.9531, ..., -8.2188, -12.0156, -16.6719], [ -14.3984, -8.7969, -16.3438, ..., -12.7344, -16.0156, -20.2812], ..., [ -9.1562, -8.1797, -10.4531, ..., -12.1172, -11.1094, -12.8984], [ -12.2656, -9.3828, -9.7266, ..., -10.5312, -14.5859, -14.6016], [ -4.7148, -3.5703, -7.8320, ..., -0.4092, -7.0117, -14.4219]], [[ -8.0781, -9.1797, -8.2031, ..., -9.4297, -8.1875, -9.3125], [ -3.7168, -7.8516, -2.6133, ..., -4.3164, -5.3242, -10.1484], [ -9.2344, -12.7109, -11.4844, ..., -10.1875, -13.3594, -15.8203], ..., [ -8.0156, -8.3828, -8.6094, ..., -9.3281, -8.1953, -9.4766], [ -8.2266, -10.6094, -9.3203, ..., -9.2266, -9.0312, -12.1953], [ -2.7793, -6.1484, -5.0000, ..., -1.8271, -5.4883, -2.3711]], [[ -6.5859, -6.5859, -6.5859, ..., -6.5859, -6.5859, -6.5859], [ -0.5088, -0.5088, -0.5088, ..., -0.5088, -0.5088, -0.5088], [ -8.0938, -8.0938, -8.0938, ..., -8.0938, -8.0938, -8.0938], ..., [ -6.7617, -6.7617, -6.7617, ..., -6.7617, -6.7617, -6.7617], [ -7.9688, -7.9688, -7.9688, ..., -7.9688, -7.9688, -7.9688], [ -1.0635, -1.0635, -1.0635, ..., -1.0635, -1.0635, -1.0635]]], device='cuda:2', dtype=torch.float16, grad_fn=)
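For context, the ValueError is raised by a forward hook from icefall/icefall/hooks.py, which checks every module's output for a non-finite sum. A minimal sketch of that kind of inf-check hook, paraphrased under assumptions rather than copied from icefall:

import torch
import torch.nn as nn

# Sketch of an inf-check forward hook in the spirit of icefall/hooks.py:
# register one hook per named module and fail fast, naming the offending
# layer in the traceback, as soon as any output's sum stops being finite.
# This is a paraphrase for illustration, not icefall's exact implementation.

def register_inf_check_hooks(model: nn.Module) -> None:
    for name, module in model.named_modules():
        def forward_hook(_module, _args, output, _name=name):
            if isinstance(output, torch.Tensor) and not torch.isfinite(output.sum()):
                raise ValueError(f"The sum of {_name}.output is not finite: {output}")
        module.register_forward_hook(forward_hook)

Note that the check reduces the whole output with .sum(): under dtype=torch.float16, whose largest finite value is 65504, that reduction can overflow to inf even when every element is finite, which is consistent with the dump above showing only finite (if large) activations. So the check can fire on large-but-finite fp16 feature maps as well as on genuine NaN/inf elements.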
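Since pretrain.py dumped both the failing batch and the model state before the processes died, the crash can in principle be replayed offline. A hypothetical sketch (the paths are copied from the log; the dictionary keys and the fp32 retry are assumptions, not verified against icefall):

import torch

# Hypothetical offline replay of the failure using the artifacts named in
# the log above. The checkpoint layout ("model" key) is an assumption.
batch = torch.load(
    "zipformer/exp2/pretrain/batch-de04cece-83e9-60e4-9c51-e692dc1729ca.pt",
    map_location="cpu",
)
ckpt = torch.load("zipformer/exp2/pretrain/bad-model-2.pt", map_location="cpu")

# model.load_state_dict(ckpt["model"])  # assumes icefall stores weights under "model"
# model = model.float().eval()          # retry in fp32: if the forward pass is
#                                       # finite, the fp16 reduction overflowed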