-- Loading model
-- Tokenizer: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/tokenizer.model
-- Model config: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/config.json
-- Model: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/Guanaco-33B-GPTQ-4bit.act-order.safetensors
-- Sequence length: 2048
-- Options: ['attention: pytorch_scaled_dp', 'matmul: switched', 'mlp: normal', 'perplexity', 'validate', 'debug', 'gpu_split: 4,20']
!! Available CUDA devices:
!! - cuda:0: NVIDIA GeForce RTX 4090
!! - cuda:1: NVIDIA GeForce RTX 4090
!! Loading safetensors file: /home/john/Projects/Python/text-models/text-generation-webui/models/TheBloke_guanaco-33B-GPTQ/Guanaco-33B-GPTQ-4bit.act-order.safetensors
!! Begin auto device map
!! Decoder size: 267,855,616 bytes
!! Decoder size, DQ: 1,070,098,432 bytes
!! Norm size: 13,312 bytes
!! Head size: 425,984,000 bytes
!! Device map:
!! - embed_tokens: cpu
!! - layers [0:10]: cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0
!! - layers [10:20]: cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:0 cuda:1 cuda:1 cuda:1 cuda:1
!! - layers [20:30]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1
!! - layers [30:40]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1
!! - layers [40:50]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1
!! - layers [50:60]: cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1 cuda:1
!! - norm: cuda:1
!! - lm_head: cuda:1
!! Begin load tensors
!! - lm_head.weight read: device: cpu, shape: [32000, 6656], dtype: bfloat16
!! - lm_head.weight map: device: cuda:1, shape: [32000, 6656], dtype: float16, min: -0.398438, max: 0.335938, std: 0.018646
!! - model.embed_tokens.weight read: device: cpu, shape: [32000, 6656], dtype: bfloat16
!! - model.embed_tokens.weight map: device: cpu, shape: [32000, 6656], dtype: float16
!! - model.layers.0.input_layernorm.weight read: device: cpu, shape: [6656], dtype: bfloat16
!! - model.layers.0.input_layernorm.weight map: device: cuda:0, shape: [6656], dtype: float16, min: 0.000099, max: 0.431641, std: 0.028687
!! - model.layers.0.mlp.down_proj.qweight read: device: cpu, shape: [2240, 6656], dtype: int32
!! - model.layers.0.mlp.down_proj.qweight map: device: cuda:0, shape: [2240, 6656], dtype: int32, min: -2146138552, max: 2146069173
!! - model.layers.0.mlp.down_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32
!! - model.layers.0.mlp.down_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2040174985, max: 2021090934
!! - model.layers.0.mlp.down_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32
!! - model.layers.0.mlp.down_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.006851, max: 0.076843, std: 0.001938
!! - model.layers.0.mlp.gate_proj.qweight read: device: cpu, shape: [832, 17920], dtype: int32
!! - model.layers.0.mlp.gate_proj.qweight map: device: cuda:0, shape: [832, 17920], dtype: int32, min: -2146081160, max: 2145818758
!! - model.layers.0.mlp.gate_proj.qzeros read: device: cpu, shape: [1, 2240], dtype: int32
!! - model.layers.0.mlp.gate_proj.qzeros map: device: cuda:0, shape: [1, 2240], dtype: int32, min: -2056882553, max: 2022205030
!! - model.layers.0.mlp.gate_proj.scales read: device: cpu, shape: [1, 17920], dtype: float32
!! - model.layers.0.mlp.gate_proj.scales map: device: cuda:0, shape: [1, 17920], dtype: float16, min: 0.003914, max: 0.053009, std: 0.001388
!! - model.layers.0.mlp.up_proj.qweight read: device: cpu, shape: [832, 17920], dtype: int32
!! - model.layers.0.mlp.up_proj.qweight map: device: cuda:0, shape: [832, 17920], dtype: int32, min: -2146989736, max: 2146015097
!! - model.layers.0.mlp.up_proj.qzeros read: device: cpu, shape: [1, 2240], dtype: int32
!! - model.layers.0.mlp.up_proj.qzeros map: device: cuda:0, shape: [1, 2240], dtype: int32, min: -2056882570, max: 2022078054
!! - model.layers.0.mlp.up_proj.scales read: device: cpu, shape: [1, 17920], dtype: float32
!! - model.layers.0.mlp.up_proj.scales map: device: cuda:0, shape: [1, 17920], dtype: float16, min: 0.003979, max: 0.028458, std: 0.000864
!! - model.layers.0.post_attention_layernorm.weight read: device: cpu, shape: [6656], dtype: bfloat16
!! - model.layers.0.post_attention_layernorm.weight map: device: cuda:0, shape: [6656], dtype: float16, min: -0.147461, max: 0.371094, std: 0.015366
!! - model.layers.0.self_attn.k_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32
!! - model.layers.0.self_attn.k_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2145941149, max: 2144831623
!! - model.layers.0.self_attn.k_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32
!! - model.layers.0.self_attn.k_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2058000042, max: 2022082182
!! - model.layers.0.self_attn.k_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32
!! - model.layers.0.self_attn.k_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.003426, max: 0.092468, std: 0.007042
!! - model.layers.0.self_attn.o_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32
!! - model.layers.0.self_attn.o_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2146968999, max: 2147125857
!! - model.layers.0.self_attn.o_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32
!! - model.layers.0.self_attn.o_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2057935001, max: 2022078310
!! - model.layers.0.self_attn.o_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32
!! - model.layers.0.self_attn.o_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.004063, max: 0.066406, std: 0.001719
!! - model.layers.0.self_attn.q_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32
!! - model.layers.0.self_attn.q_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2144888953, max: 2144892026
!! - model.layers.0.self_attn.q_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32
!! - model.layers.0.self_attn.q_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2055838105, max: 2022143846
!! - model.layers.0.self_attn.q_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32
!! - model.layers.0.self_attn.q_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.003687, max: 0.076843, std: 0.005463
!! - model.layers.0.self_attn.v_proj.qweight read: device: cpu, shape: [832, 6656], dtype: int32
!! - model.layers.0.self_attn.v_proj.qweight map: device: cuda:0, shape: [832, 6656], dtype: int32, min: -2145871737, max: 2146928759
!! - model.layers.0.self_attn.v_proj.qzeros read: device: cpu, shape: [1, 832], dtype: int32
!! - model.layers.0.self_attn.v_proj.qzeros map: device: cuda:0, shape: [1, 832], dtype: int32, min: -2040109209, max: 2021025654
!! - model.layers.0.self_attn.v_proj.scales read: device: cpu, shape: [1, 6656], dtype: float32
!! - model.layers.0.self_attn.v_proj.scales map: device: cuda:0, shape: [1, 6656], dtype: float16, min: 0.002279, max: 0.012497, std: 0.001468
!! - model.norm.weight read: device: cpu, shape: [6656], dtype: bfloat16
!! - model.norm.weight map: device: cuda:1, shape: [6656], dtype: float16, min: 0.140625, max: 1.851562, std: 0.055695
!! Computing RoPE table for seq length: 2048
!! - stored for device: cuda:0
!! - stored for device: cuda:1
** Time, Load model: 3.75 seconds
-- Groupsize (inferred): None
-- Act-order (inferred): no
** VRAM, Model: [cuda:0] 4,142.07 MB - [cuda:1] 11,797.71 MB
!! Inference, debug pass
!! Begin forward pass
!! Built initial hidden state: device: cpu, shape: [1, 1920, 6656], dtype: float16, min: -0.133789, max: 0.140625, std: 0.014366
!! Prepared buffer for device: cuda:0
!! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000
!! Prepared buffer for device: cuda:1
!! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000
!! Moving hidden states from cpu to cuda:0
!! Begin decoder 0
!! Begin self-attention
!! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -0.133789, max: 0.140625, std: 0.014366
!! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000
!! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.000099, max: 0.431641, std: 0.028687 eps: 0.00000100
!! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.003687/0.076843/0.005463
!! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003426/0.092468/0.007042
!! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.002279/0.012497/0.001468
!! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004063/0.066406/0.001719
!! - cache device: cuda:0, seq_len: 0
!! Begin MLP
!! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -1.573242, max: 1.157227, std: 0.028137
!! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: -0.147461, max: 0.371094, std: 0.015366 eps: 0.00000100
!! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.003914/0.053009/0.001388
!! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.003979/0.028458/0.000864
!! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.006851/0.076843/0.001938
!! - method: normal
!! Begin decoder 1
!! Begin self-attention
!! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -24.609375, max: 25.875000, std: 0.091553
!! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000
!! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.015381, max: 0.324219, std: 0.018555 eps: 0.00000100
!! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.002424/0.050140/0.005402
!! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.002337/0.058197/0.006481
!! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.002319/0.011330/0.001104
!! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004444/0.069519/0.001976
!! - cache device: cuda:0, seq_len: 0
!! Begin MLP
- hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -22.484375, max: 24.375000, std: 0.075256 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: -0.000671, max: 0.155273, std: 0.010117 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006138/0.075256/0.001514 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005680/0.052460/0.001062 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007244/0.147339/0.002571 !! - method: normal !! Begin decoder 2 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -28.156250, max: 31.031250, std: 0.113525 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.000816, max: 0.427734, std: 0.013031 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004768/0.062744/0.005123 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003914/0.052094/0.004726 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003191/0.016769/0.000727 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.004917/0.069275/0.001596 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -27.312500, max: 30.296875, std: 0.108948 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.007019, max: 0.184570, std: 0.009277 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006039/0.066162/0.001669 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.041138/0.000927 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007339/0.080444/0.001779 !! - method: normal !! Begin decoder 3 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -28.109375, max: 31.593750, std: 0.125488 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.054688, max: 0.427734, std: 0.013016 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004475/0.063416/0.004517 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003727/0.061981/0.005016 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003084/0.020569/0.000896 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005047/0.087769/0.002256 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -24.203125, max: 26.046875, std: 0.128174 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.008911, max: 0.202148, std: 0.009621 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.066040/0.001658 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005714/0.056122/0.001195 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007324/0.267822/0.003691 !! - method: normal !! Begin decoder 4 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.750000, max: 2638.000000, std: 0.761230 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.017456, max: 0.480469, std: 0.015640 eps: 0.00000100 !! 
- self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004036/0.045837/0.003664 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003801/0.035950/0.003613 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003468/0.015656/0.000794 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005535/0.054565/0.001162 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.937500, max: 2638.000000, std: 0.763672 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.009338, max: 0.158203, std: 0.009377 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006054/0.055481/0.001560 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005939/0.055725/0.001166 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007454/0.062866/0.001432 !! - method: normal !! Begin decoder 5 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.437500, max: 2640.000000, std: 0.768066 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.020508, max: 0.478516, std: 0.016983 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004158/0.042328/0.003759 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004135/0.033783/0.003658 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004028/0.013931/0.000834 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005470/0.041016/0.001056 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.937500, max: 2640.000000, std: 0.770020 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.025024, max: 0.162109, std: 0.008591 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.046295/0.001429 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.005550/0.052094/0.001036 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007374/0.055206/0.001354 !! - method: normal !! Begin decoder 6 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.687500, max: 2642.000000, std: 0.774414 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.018799, max: 0.472656, std: 0.015656 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004192/0.052856/0.004028 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004208/0.035034/0.003809 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004158/0.014488/0.000650 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005615/0.058594/0.001074 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -113.062500, max: 2642.000000, std: 0.778320 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.029297, max: 0.173828, std: 0.008652 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.051697/0.001435 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.049622/0.001033 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007359/0.056915/0.001345 !! - method: normal !! Begin decoder 7 !! Begin self-attention !! 
- hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -113.062500, max: 2642.000000, std: 0.782715 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.023682, max: 0.492188, std: 0.014145 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004395/0.043488/0.003618 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004539/0.032104/0.003489 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004982/0.012306/0.000669 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.042175/0.000939 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -113.687500, max: 2642.000000, std: 0.787598 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.026489, max: 0.183594, std: 0.009079 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006382/0.038269/0.001271 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006382/0.050140/0.001077 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007210/0.052338/0.001207 !! - method: normal !! Begin decoder 8 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -113.687500, max: 2642.000000, std: 0.793457 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.026978, max: 0.531250, std: 0.014130 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.003889/0.046234/0.003653 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003979/0.031189/0.003502 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004803/0.012077/0.000616 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005909/0.044922/0.000819 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -114.187500, max: 2642.000000, std: 0.799316 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.037109, max: 0.194336, std: 0.008949 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006462/0.044006/0.001260 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006542/0.049744/0.001063 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007534/0.072937/0.001604 !! - method: normal !! Begin decoder 9 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -114.187500, max: 2644.000000, std: 0.802734 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.031250, max: 0.527344, std: 0.014053 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004444/0.045441/0.003422 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004379/0.032166/0.003338 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.004589/0.014809/0.000655 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005989/0.047913/0.001200 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -114.437500, max: 2644.000000, std: 0.805176 !! 
- layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.051270, max: 0.249023, std: 0.008759 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006332/0.044128/0.001369 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006542/0.048187/0.001155 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007454/0.119019/0.002102 !! - method: normal !! Begin decoder 10 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.250000, max: 2672.000000, std: 0.816895 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.039062, max: 0.550781, std: 0.014366 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004013/0.052612/0.003538 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004108/0.037903/0.003376 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.012306/0.000686 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006054/0.056641/0.001063 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -112.437500, max: 2672.000000, std: 0.820312 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.052246, max: 0.439453, std: 0.009697 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005795/0.053131/0.001467 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006512/0.050903/0.001198 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007534/0.173462/0.002787 !! - method: normal !! Begin decoder 11 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -96.125000, max: 3004.000000, std: 0.922363 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.046143, max: 0.574219, std: 0.015808 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004021/0.051056/0.004143 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003994/0.034119/0.003323 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.017609/0.000787 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006104/0.071594/0.001420 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -96.500000, max: 3004.000000, std: 0.924805 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.041016, max: 0.218750, std: 0.009247 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006248/0.064209/0.001389 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006641/0.054565/0.001169 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007259/0.103394/0.001874 !! - method: normal !! Begin decoder 12 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -97.062500, max: 3006.000000, std: 0.927246 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.052246, max: 0.601562, std: 0.015060 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004402/0.053131/0.003906 !! 
- self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003906/0.042969/0.003477 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005226/0.018158/0.000777 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006069/0.082825/0.001306 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -97.625000, max: 3006.000000, std: 0.934082 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.059570, max: 0.228516, std: 0.008789 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005695/0.050140/0.001423 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006218/0.060944/0.001321 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007359/0.105713/0.001951 !! - method: normal !! Begin decoder 13 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -97.812500, max: 3008.000000, std: 0.935547 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.064941, max: 0.613281, std: 0.016144 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004116/0.049225/0.003771 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.004044/0.036194/0.003403 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.003922/0.019241/0.000883 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006184/0.059113/0.001060 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -98.312500, max: 3008.000000, std: 0.938477 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.050049, max: 0.229492, std: 0.008698 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.006023/0.040558/0.001410 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006332/0.045776/0.001092 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007633/0.083618/0.001544 !! - method: normal !! Begin decoder 14 !! Begin self-attention !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -98.562500, max: 3008.000000, std: 0.940430 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.067871, max: 0.640625, std: 0.016388 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004021/0.046356/0.003773 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003752/0.035431/0.003366 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005142/0.018295/0.000790 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.006248/0.069519/0.001388 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -99.062500, max: 3008.000000, std: 0.945801 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.049805, max: 0.229492, std: 0.008499 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005959/0.048950/0.001568 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006447/0.066467/0.001188 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007404/0.095337/0.001607 !! - method: normal !! Begin decoder 15 !! Begin self-attention !! 
- hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -99.125000, max: 3008.000000, std: 0.945801 !! - attn_mask: device: cuda:0, shape: [1, 1, 1920, 1920], dtype: float16, min: -65504.000000, max: 0.000000, std: 32752.000000 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.069336, max: 0.664062, std: 0.015411 eps: 0.00000100 !! - self_attn.q_proj: cuda:0 [Q] scales min/max/std: 0.004143/0.051575/0.003738 !! - self_attn.k_proj: cuda:0 [Q] scales min/max/std: 0.003906/0.040497/0.003508 !! - self_attn.v_proj: cuda:0 [Q] scales min/max/std: 0.005436/0.019791/0.000878 !! - self_attn.o_proj: cuda:0 [Q] scales min/max/std: 0.005760/0.103882/0.001384 !! - cache device: cuda:0, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:0, shape: [1, 1920, 6656], dtype: float16, min: -99.562500, max: 3008.000000, std: 0.954102 !! - layernorm.weight: device: cuda:0, shape: [6656], dtype: float16, min: 0.065430, max: 0.235352, std: 0.008469 eps: 0.00000100 !! - mlp.gate_proj: cuda:0 [Q] scales min/max/std: 0.005028/0.074768/0.001600 !! - mlp.up_proj: cuda:0 [Q] scales min/max/std: 0.006348/0.064697/0.001232 !! - mlp.down_proj: cuda:0 [Q] scales min/max/std: 0.007439/0.103882/0.002073 !! - method: normal !! Moving hidden states from cuda:0 to cuda:1 !! Begin decoder 16 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059570, max: 0.671875, std: 0.015488 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.004230/0.053131/0.003641 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003994/0.037048/0.003208 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005451/0.025070/0.000992 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006023/0.060425/0.001118 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035156, max: 0.241211, std: 0.009331 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.067078/0.001477 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.039703/0.001064 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.096619/0.001308 !! - method: normal !! Begin decoder 17 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060791, max: 0.671875, std: 0.015564 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003914/0.048035/0.003668 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003134/0.035675/0.003126 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004890/0.022491/0.000873 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006447/0.095825/0.001544 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! 
- layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035400, max: 0.244141, std: 0.009109 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.052856/0.001426 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.053528/0.001071 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007359/0.071350/0.001134 !! - method: normal !! Begin decoder 18 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.068848, max: 0.679688, std: 0.015129 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003613/0.045837/0.003527 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003523/0.039978/0.003304 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004982/0.015686/0.000898 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.005760/0.064819/0.000910 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044922, max: 0.245117, std: 0.008545 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005142/0.056000/0.001453 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006641/0.053253/0.001026 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.077881/0.001364 !! - method: normal !! Begin decoder 19 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.065430, max: 0.687500, std: 0.014000 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003801/0.046082/0.003609 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003393/0.037170/0.003099 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005112/0.022858/0.000912 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.005894/0.059906/0.001084 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.039062, max: 0.249023, std: 0.008957 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005161/0.037506/0.001387 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.036560/0.001002 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007668/0.082825/0.001348 !! - method: normal !! Begin decoder 20 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.056641, max: 0.703125, std: 0.013672 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003784/0.052338/0.003933 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003353/0.035614/0.003248 !! 
- self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005272/0.020309/0.000875 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006577/0.106506/0.002047 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.025269, max: 0.257812, std: 0.009445 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006088/0.049469/0.001395 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.061584/0.001181 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.085693/0.001389 !! - method: normal !! Begin decoder 21 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059814, max: 0.734375, std: 0.013329 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003801/0.053009/0.003752 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003256/0.033722/0.003275 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005386/0.020508/0.000772 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006233/0.066162/0.001053 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.035400, max: 0.257812, std: 0.009483 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006313/0.056824/0.001446 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006641/0.038788/0.000963 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.060425/0.001002 !! - method: normal !! Begin decoder 22 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.059326, max: 0.703125, std: 0.012497 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002922/0.052460/0.004002 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002678/0.033142/0.003107 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.018646/0.000729 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006592/0.086182/0.001471 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.020386, max: 0.267578, std: 0.010315 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006397/0.047668/0.001308 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006542/0.045624/0.001020 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.107056/0.001574 !! - method: normal !! Begin decoder 23 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! 
- attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060303, max: 0.765625, std: 0.014275 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003565/0.055847/0.004059 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002800/0.033905/0.003366 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005077/0.016434/0.000742 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.047272/0.000874 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.022949, max: 0.275391, std: 0.010490 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006203/0.045044/0.001418 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006577/0.035034/0.000887 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.071472/0.001110 !! - method: normal !! Begin decoder 24 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060547, max: 0.703125, std: 0.013268 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003345/0.049866/0.004017 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002897/0.032166/0.003105 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005859/0.016769/0.000706 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.057556/0.001293 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.021973, max: 0.279297, std: 0.010460 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006382/0.041412/0.001295 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.058868/0.001081 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007668/0.084534/0.001464 !! - method: normal !! Begin decoder 25 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.062988, max: 0.679688, std: 0.013092 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003069/0.050903/0.003702 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002857/0.039062/0.003319 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005581/0.018814/0.000751 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.099243/0.001510 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.285156, std: 0.010880 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.005581/0.047394/0.001300 !! 
- mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006706/0.043243/0.001004 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.081482/0.001511 !! - method: normal !! Begin decoder 26 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.053467, max: 0.718750, std: 0.013550 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003443/0.057953/0.004169 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003181/0.035095/0.003294 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005405/0.019562/0.000778 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.027542/0.000661 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.012085, max: 0.296875, std: 0.012276 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.052094/0.001257 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006607/0.030212/0.000802 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.063660/0.001120 !! - method: normal !! Begin decoder 27 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.041504, max: 0.664062, std: 0.013268 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003246/0.061188/0.004597 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002808/0.042969/0.003384 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004257/0.020767/0.000847 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.140137/0.002386 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.008850, max: 0.306641, std: 0.012772 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006397/0.040039/0.001224 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006512/0.039978/0.000906 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007519/0.074219/0.001472 !! - method: normal !! Begin decoder 28 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036865, max: 0.734375, std: 0.013893 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003670/0.061188/0.004440 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003345/0.039459/0.003338 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005760/0.014290/0.000641 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.062225/0.000916 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! 
- hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.003601, max: 0.314453, std: 0.013245 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006756/0.039124/0.001158 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006626/0.041290/0.000786 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.099976/0.001548 !! - method: normal !! Begin decoder 29 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.030640, max: 0.750000, std: 0.014641 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003246/0.070435/0.004562 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002693/0.036469/0.003307 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006248/0.022598/0.000674 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006870/0.058594/0.001038 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.000912, max: 0.320312, std: 0.013840 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.035034/0.001091 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006786/0.052734/0.000846 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007504/0.074219/0.001188 !! - method: normal !! Begin decoder 30 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.032715, max: 0.750000, std: 0.015472 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003101/0.059509/0.004513 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003157/0.040894/0.003395 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005550/0.015137/0.000741 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006802/0.122681/0.001928 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.002228, max: 0.328125, std: 0.014160 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006966/0.035156/0.001039 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006626/0.033020/0.000750 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.075500/0.001152 !! - method: normal !! Begin decoder 31 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.023193, max: 0.769531, std: 0.015266 eps: 0.00000100 !! 
- self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003410/0.072510/0.004620 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002954/0.043610/0.003323 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005630/0.015366/0.000771 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006950/0.072021/0.001338 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.003433, max: 0.335938, std: 0.014183 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007114/0.036194/0.001136 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.045319/0.000835 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.084534/0.001247 !! - method: normal !! Begin decoder 32 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031006, max: 0.726562, std: 0.015465 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003475/0.058594/0.004498 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003166/0.039581/0.003384 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004539/0.016357/0.000813 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.147949/0.002407 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.001793, max: 0.345703, std: 0.014183 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.047150/0.001182 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.044800/0.000997 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007603/0.066406/0.001269 !! - method: normal !! Begin decoder 33 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.828125, std: 0.016769 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003117/0.058472/0.004299 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002913/0.041931/0.003408 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005207/0.013672/0.000735 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006916/0.049622/0.000923 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.005981, max: 0.375000, std: 0.014603 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.073547/0.001303 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.063049/0.000965 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007389/0.092957/0.001473 !! - method: normal !! Begin decoder 34 !! Begin self-attention !! 
- hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031494, max: 0.785156, std: 0.016815 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003500/0.060822/0.004780 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003117/0.043488/0.003405 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005714/0.016998/0.000655 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.058990/0.001354 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: -0.000751, max: 0.343750, std: 0.015251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.052094/0.001061 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.075500/0.001045 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007488/0.100403/0.001640 !! - method: normal !! Begin decoder 35 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.032715, max: 0.804688, std: 0.017563 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002930/0.060150/0.004387 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.042847/0.003531 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004963/0.013313/0.000789 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.063660/0.001132 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.004547, max: 0.353516, std: 0.015144 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.048370/0.001162 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.033997/0.000875 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007504/0.095032/0.001330 !! - method: normal !! Begin decoder 36 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.029785, max: 0.839844, std: 0.017242 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003223/0.058716/0.004532 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002962/0.041290/0.003490 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004574/0.019562/0.000918 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.117737/0.001760 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! 
- layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.010925, max: 0.359375, std: 0.015305 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.061859/0.001213 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.048553/0.000885 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007374/0.075012/0.001202 !! - method: normal !! Begin decoder 37 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.025879, max: 0.828125, std: 0.017578 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003199/0.067688/0.004490 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003044/0.045441/0.003639 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006004/0.021515/0.000705 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006870/0.085144/0.001255 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.009827, max: 0.361328, std: 0.015839 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007210/0.050079/0.001101 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006821/0.032745/0.000821 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.119019/0.001538 !! - method: normal !! Begin decoder 38 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.027100, max: 0.843750, std: 0.018066 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003328/0.055847/0.004547 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.003012/0.039703/0.003664 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005695/0.021423/0.000836 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.046753/0.001000 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.005066, max: 0.382812, std: 0.016144 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007160/0.043304/0.001045 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006786/0.029694/0.000715 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007095/0.076050/0.001014 !! - method: normal !! Begin decoder 39 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.029907, max: 0.808594, std: 0.017258 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.063660/0.004723 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002539/0.042847/0.003702 !! 
- self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005501/0.018784/0.000795 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.089844/0.001469 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.022949, max: 0.371094, std: 0.016068 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.061584/0.001096 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.065613/0.000909 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007584/0.081482/0.001101 !! - method: normal !! Begin decoder 40 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031982, max: 0.898438, std: 0.019562 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003418/0.057556/0.004429 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002800/0.037903/0.003567 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.018036/0.000762 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.041718/0.000777 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.008423, max: 0.394531, std: 0.016434 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007114/0.048431/0.001036 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.026566/0.000744 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007633/0.073181/0.000994 !! - method: normal !! Begin decoder 41 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.031250, max: 0.867188, std: 0.019302 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002848/0.075500/0.005047 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002645/0.038727/0.003561 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005421/0.020111/0.000769 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007030/0.158813/0.002529 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.015076, max: 0.402344, std: 0.016510 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.052460/0.001122 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.036011/0.000782 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007488/0.067078/0.000942 !! - method: normal !! Begin decoder 42 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! 
- attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.028442, max: 0.855469, std: 0.018463 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003141/0.062225/0.004654 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002987/0.042847/0.003576 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005077/0.019669/0.000777 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006916/0.068726/0.001397 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.013184, max: 0.404297, std: 0.016388 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007179/0.059631/0.001117 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.029617/0.000745 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.058197/0.000825 !! - method: normal !! Begin decoder 43 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.882812, std: 0.020325 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003279/0.056366/0.004562 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002865/0.035950/0.003454 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005421/0.019440/0.000834 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.037384/0.000866 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.034912, max: 0.408203, std: 0.015991 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007275/0.040222/0.001178 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006901/0.043488/0.000858 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007439/0.083313/0.001134 !! - method: normal !! Begin decoder 44 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038818, max: 0.804688, std: 0.019043 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002954/0.067444/0.004791 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002790/0.041138/0.003475 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.006023/0.017288/0.000857 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007145/0.081787/0.001302 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037354, max: 0.406250, std: 0.016190 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.059753/0.001092 !! 
- mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.027084/0.000705 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.005371/0.114075/0.001456 !! - method: normal !! Begin decoder 45 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.033691, max: 0.839844, std: 0.020630 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003304/0.061340/0.004360 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002890/0.035950/0.003561 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004173/0.022720/0.000947 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.033539/0.000868 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.023560, max: 0.417969, std: 0.016037 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007244/0.053528/0.001163 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.030991/0.000765 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007423/0.060822/0.001042 !! - method: normal !! Begin decoder 46 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.034912, max: 0.812500, std: 0.018784 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002995/0.049866/0.004936 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002539/0.037384/0.003443 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005161/0.013802/0.000974 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.116150/0.001753 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038574, max: 0.423828, std: 0.015762 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006966/0.081482/0.001361 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006771/0.050446/0.000865 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007603/0.110168/0.001574 !! - method: normal !! Begin decoder 47 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.839844, std: 0.019562 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002832/0.056763/0.004982 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002758/0.040100/0.003574 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004833/0.013931/0.000822 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007210/0.078125/0.001239 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! 
- hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036621, max: 0.441406, std: 0.015945 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.041809/0.001195 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006657/0.032410/0.000795 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.097656/0.001348 !! - method: normal !! Begin decoder 48 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.800781, std: 0.019073 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003084/0.055328/0.004616 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002783/0.037445/0.003414 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004135/0.015594/0.000752 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007145/0.035614/0.000791 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036133, max: 0.437500, std: 0.016052 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007179/0.036835/0.001137 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.042114/0.000801 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.006218/0.084351/0.001168 !! - method: normal !! Begin decoder 49 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.043213, max: 0.902344, std: 0.019379 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003296/0.063049/0.004436 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002987/0.038025/0.003288 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005974/0.023300/0.001044 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007553/0.071106/0.001290 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.036377, max: 0.421875, std: 0.016068 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006802/0.048309/0.001147 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.041656/0.000799 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007195/0.068481/0.001121 !! - method: normal !! Begin decoder 50 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037109, max: 0.847656, std: 0.019165 eps: 0.00000100 !! 
- self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003109/0.070923/0.004910 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002613/0.042694/0.003321 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005192/0.015526/0.000921 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007454/0.074768/0.001302 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.037842, max: 0.433594, std: 0.016251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007065/0.037750/0.001138 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.028976/0.000724 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.006168/0.105713/0.001479 !! - method: normal !! Begin decoder 51 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.038330, max: 0.867188, std: 0.019226 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.003418/0.064209/0.004681 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002613/0.034637/0.003159 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003540/0.012924/0.001039 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007259/0.076538/0.001249 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044189, max: 0.449219, std: 0.016373 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.050140/0.001146 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006935/0.028000/0.000729 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007469/0.057434/0.001116 !! - method: normal !! Begin decoder 52 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.061035, max: 0.679688, std: 0.020767 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002872/0.047668/0.004078 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002050/0.034241/0.003626 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003979/0.017059/0.000966 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006981/0.065491/0.001202 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.044189, max: 0.464844, std: 0.016251 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007130/0.061981/0.001325 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006363/0.082153/0.001008 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.079712/0.001740 !! - method: normal !! Begin decoder 53 !! Begin self-attention !! 
- hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.060303, max: 0.863281, std: 0.020859 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002743/0.058868/0.003712 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002295/0.036194/0.003431 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004250/0.014290/0.000860 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007015/0.103149/0.001647 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.443359, std: 0.016922 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007015/0.035278/0.001202 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006737/0.030212/0.000880 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007454/0.096619/0.001512 !! - method: normal !! Begin decoder 54 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.053711, max: 0.890625, std: 0.020660 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002783/0.047668/0.004032 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002377/0.036774/0.003460 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005142/0.014290/0.000920 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007534/0.093506/0.001501 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.054199, max: 0.464844, std: 0.017532 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007244/0.093079/0.001365 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006886/0.059692/0.000940 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007683/0.149780/0.002497 !! - method: normal !! Begin decoder 55 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.058105, max: 0.937500, std: 0.021545 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002678/0.043610/0.003986 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002100/0.040741/0.003601 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005127/0.014778/0.001210 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007469/0.052094/0.001225 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! 
- layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.054199, max: 0.480469, std: 0.017639 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007080/0.041656/0.001164 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006721/0.044983/0.000927 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007618/0.155151/0.002712 !! - method: normal !! Begin decoder 56 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.063477, max: 0.882812, std: 0.020828 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002434/0.054810/0.004154 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002108/0.037903/0.003489 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005451/0.014809/0.000838 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007534/0.059235/0.001205 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.045166, max: 0.562500, std: 0.018280 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007130/0.046600/0.001348 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006557/0.044922/0.001155 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007763/0.253174/0.003845 !! - method: normal !! Begin decoder 57 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.000561, max: 0.945312, std: 0.023651 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.002205/0.064087/0.005341 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.001758/0.045197/0.003632 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.004539/0.013252/0.001018 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.007160/0.059235/0.001537 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.057129, max: 0.578125, std: 0.018356 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.007046/0.050385/0.001329 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006592/0.057037/0.001279 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.225464/0.004005 !! - method: normal !! Begin decoder 58 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.018066, max: 0.968750, std: 0.025986 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.001770/0.091675/0.005989 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.002068/0.038147/0.003706 !! 
- self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.005371/0.024643/0.000950 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006836/0.104675/0.001700 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.116211, max: 0.498047, std: 0.016815 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006691/0.038422/0.001431 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.006039/0.072632/0.001469 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.007713/0.087219/0.003716 !! - method: normal !! Begin decoder 59 !! Begin self-attention !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - attn_mask: device: cuda:1, shape: [1, 1, 1920, 1920], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.040283, max: 1.000000, std: 0.027527 eps: 0.00000100 !! - self_attn.q_proj: cuda:1 [Q] scales min/max/std: 0.001673/0.081787/0.006100 !! - self_attn.k_proj: cuda:1 [Q] scales min/max/std: 0.001400/0.034760/0.003492 !! - self_attn.v_proj: cuda:1 [Q] scales min/max/std: 0.003922/0.018036/0.001408 !! - self_attn.o_proj: cuda:1 [Q] scales min/max/std: 0.006851/0.092957/0.002872 !! - cache device: cuda:1, seq_len: 0 !! Begin MLP !! - hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! - layernorm.weight: device: cuda:1, shape: [6656], dtype: float16, min: 0.148438, max: 0.500000, std: 0.024963 eps: 0.00000100 !! - mlp.gate_proj: cuda:1 [Q] scales min/max/std: 0.006203/0.051300/0.001739 !! - mlp.up_proj: cuda:1 [Q] scales min/max/std: 0.005501/0.087769/0.002033 !! - mlp.down_proj: cuda:1 [Q] scales min/max/std: 0.008026/0.162476/0.005074 !! - method: normal !! pre norm, hidden_states: device: cuda:1, shape: [1, 1920, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! pre lm_head, hidden_states: device: cuda:1, shape: [1, 1, 6656], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000 !! logits: device: cuda:1, shape: [1, 1, 32000], dtype: float16, min: 0.000000, max: 0.000000, std: 0.000000
** Time, Inference: 1.56 seconds
-- Loading dataset...
-- Testing..........
** Perplexity: nan
-- Testing.
** Perplexity (switched): nan
-- Testing.
** Perplexity (quant_only): nan

Traceback (most recent call last):
  File "/home/john/Projects/Python/GLaDOS/exllama/test_benchmark_inference.py", line 294, in <module>
    text = generator.generate_simple("To be or not to be, that is the", max_new_tokens = 20)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 179, in generate_simple
    token = self.gen_single_token()
            ^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 202, in gen_single_token
    token, _ = self.sample(logits,
               ^^^^^^^^^^^^^^^^^^^
  File "/home/john/Projects/Python/GLaDOS/exllama/generator.py", line 77, in sample
    sampled_ind = torch.multinomial(norm_probs, norm_probs.shape[-1] if num == -1 else min(num, norm_probs.shape[-1]))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: probability tensor contains either `inf`, `nan` or element < 0
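
For reference, the RuntimeError in the last frame is torch.multinomial's input check: it fires whenever the probability tensor handed to the sampler contains inf, nan or negative entries, which lines up with the nan perplexity readings above. Below is a minimal standalone sketch (plain PyTorch, not exllama's generator code; the shapes and the guard are illustrative assumptions) that reproduces the same class of failure and shows an earlier check:

import torch

# All-nan logits stand in for a broken forward pass; softmax keeps the nans.
logits = torch.full((1, 32000), float("nan"))
probs = torch.softmax(logits, dim=-1)

try:
    # torch.multinomial rejects nan/inf/negative probabilities with a
    # RuntimeError -- the same kind of failure as in the traceback above.
    torch.multinomial(probs, num_samples=1)
except RuntimeError as err:
    print(err)

# Checking the probabilities before sampling points at the real culprit
# (the forward pass) instead of failing inside the sampler.
if not torch.isfinite(probs).all():
    print("non-finite probabilities -- the logits coming out of the model are bad")

The guard does not fix anything by itself; it only makes it obvious that the bad values originate upstream of the sampler, in the same forward pass that produced the nan perplexities.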