From b5d1e9ffea954f4cf2720bcc0b2541f2a2aa3091 Mon Sep 17 00:00:00 2001
From: ooooo <3164076421@qq.com>
Date: Fri, 15 Aug 2025 11:15:36 +0800
Subject: [PATCH] [New Sample] Add Some NLP Computational Graph

---
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 3094 +++++++++++++
 .../weight_meta.py | 3960 +++++++++++++++++
 .../Onutoa_1_6e-3_5_0.5/graph_hash.txt | 1 +
 .../Onutoa_1_6e-3_5_0.5/graph_net.json | 6 +
 .../Onutoa_1_6e-3_5_0.5/input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../Onutoa_1_6e-3_5_0.5/model.py | 3080 +++++++++++++
 .../Onutoa_1_6e-3_5_0.5/weight_meta.py | 3949 ++++++++++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 2058 +++++++++
 .../weight_meta.py | 2008 +++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 1690 +++++++
 .../weight_meta.py | 1038 +++++
 .../ashique_BanglaTraitBERT/graph_hash.txt | 1 +
 .../ashique_BanglaTraitBERT/graph_net.json | 6 +
 .../ashique_BanglaTraitBERT/input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../ashique_BanglaTraitBERT/model.py | 3271 ++++++++++++++
 .../ashique_BanglaTraitBERT/weight_meta.py | 3951 ++++++++++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 1818 ++++++++
 .../weight_meta.py | 2269 ++++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 1818 ++++++++
 .../weight_meta.py | 2287 ++++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 789 ++++
 .../weight_meta.py | 1072 +++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 2301 ++++++++++
 .../weight_meta.py | 1168 +++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 1112 +++++
 .../weight_meta.py | 1389 ++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 620 +++
 .../weight_meta.py | 749 ++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 620 +++
 .../weight_meta.py | 749 ++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../ogoshi2000_stance-nystromformer/model.py | 1851 ++++++++
 .../weight_meta.py | 2147 +++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 3271 ++++++++++++++
 .../weight_meta.py | 3950 ++++++++++++++++
 .../graph_hash.txt | 1 +
 .../graph_net.json | 6 +
 .../input_meta.py | 0
 .../input_tensor_constraints.py | 0
 .../model.py | 866 ++++
 .../weight_meta.py | 1088 +++++
 90 files changed, 60138 insertions(+)
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_net.json
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_meta.py
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/model.py
 create mode 100644 samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/weight_meta.py
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_net.json
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_meta.py
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/model.py
 create mode 100644 samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/weight_meta.py
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_meta.py
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py
 create mode 100644 samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/weight_meta.py
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_net.json
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_meta.py
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/model.py
 create mode 100644 samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/weight_meta.py
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_net.json
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/input_meta.py
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/model.py
 create mode 100644 samples/transformers-auto-model/ashique_BanglaTraitBERT/weight_meta.py
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_net.json
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_meta.py
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/model.py
 create mode 100644 samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/weight_meta.py
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_net.json
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_meta.py
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/model.py
 create mode 100644 samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/weight_meta.py
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_net.json
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_meta.py
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/model.py
 create mode 100644 samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/weight_meta.py
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_net.json
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_meta.py
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/model.py
 create mode 100644 samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/weight_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_net.json
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/model.py
 create mode 100644 samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/weight_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_net.json
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/model.py
 create mode 100644 samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/weight_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_net.json
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_meta.py
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/model.py
 create mode 100644 samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/weight_meta.py
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_net.json
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_meta.py
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/model.py
 create mode 100644 samples/transformers-auto-model/ogoshi2000_stance-nystromformer/weight_meta.py
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_net.json
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_meta.py
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/model.py
 create mode 100644 samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/weight_meta.py
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_hash.txt
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_net.json
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_meta.py
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_tensor_constraints.py
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/model.py
 create mode 100644 samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/weight_meta.py

diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_hash.txt b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_hash.txt
new file mode 100644
index 000000000..ffa427b9b
--- /dev/null
+++ b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_hash.txt
@@ -0,0 +1 @@
+f7bcd8ac8a5673e44a86a11d2130ea45ed19abc85d9e3cd2c871eb3a78b62451
\ No newline at end of file
diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_net.json b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_meta.py b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_tensor_constraints.py b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/model.py b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/model.py
new file mode 100644
index 000000000..405eaa534
--- /dev/null
+++ b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/model.py
@@ -0,0 +1,3094 @@
+import torch
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_self_modules_embeddings_buffers_token_type_ids_: torch.Tensor,
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_input_ids_ = L_input_ids_
+        l_self_modules_embeddings_buffers_token_type_ids_ = (
+            L_self_modules_embeddings_buffers_token_type_ids_
+        )
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
+        l_attention_mask_ = L_attention_mask_
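+        # The aliases below re-bind each captured graph input to a local.
+        # The names follow TorchDynamo's flattened convention (the module
+        # path joined by "_modules_" / "_parameters_" / "_buffers_"); the
+        # only change on the left-hand side is that "LayerNorm" is
+        # lower-cased to "layer_norm". For example (assumed mapping back to
+        # the source module tree):
+        #   l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        # corresponds to self.encoder.layer[0].attention.self.query.weight.
+        # One sixteen-alias block follows for each of the 24 encoder layers.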
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_
l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_ + 
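+        # NOTE: the long run of assignments in this prologue is Dynamo's
+        # parameter flattening, not hand-written code. For each of the 24
+        # encoder layers, every parameter (query/key/value projections,
+        # attention output dense, LayerNorm weights and biases, intermediate
+        # and output dense) is rebound from a graph input `L_...` to a local
+        # `l_...`; each local is later set to None once consumed so
+        # temporaries can be freed inside the captured graph.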
l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_pooler_modules_dense_parameters_weight_ = ( + 
+            L_self_modules_pooler_modules_dense_parameters_weight_
+        )
+        l_self_modules_pooler_modules_dense_parameters_bias_ = (
+            L_self_modules_pooler_modules_dense_parameters_bias_
+        )
+        buffered_token_type_ids = l_self_modules_embeddings_buffers_token_type_ids_[
+            (slice(None, None, None), slice(None, 20, None))
+        ]
+        l_self_modules_embeddings_buffers_token_type_ids_ = None
+        buffered_token_type_ids_expanded = buffered_token_type_ids.expand(1, 20)
+        buffered_token_type_ids = None
+        ne = l_input_ids_.ne(1)
+        mask = ne.int()
+        ne = None
+        cumsum = torch.cumsum(mask, dim=1)
+        type_as = cumsum.type_as(mask)
+        cumsum = None
+        add = type_as + 0
+        type_as = None
+        incremental_indices = add * mask
+        add = mask = None
+        long = incremental_indices.long()
+        incremental_indices = None
+        position_ids = long + 1
+        long = None
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        token_type_embeddings = torch.nn.functional.embedding(
+            buffered_token_type_ids_expanded,
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        buffered_token_type_ids_expanded = (
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        ) = None
+        embeddings = inputs_embeds + token_type_embeddings
+        inputs_embeds = token_type_embeddings = None
+        position_embeddings = torch.nn.functional.embedding(
+            position_ids,
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        position_ids = (
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        ) = None
+        embeddings += position_embeddings
+        embeddings_1 = embeddings
+        embeddings = position_embeddings = None
+        embeddings_2 = torch.nn.functional.layer_norm(
+            embeddings_1,
+            (1024,),
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+            l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        embeddings_1 = (
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False)
+        embeddings_2 = None
+        getitem_1 = l_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_attention_mask_ = None
+        expand_1 = getitem_1.expand(1, 1, 20, 20)
+        getitem_1 = None
+        expanded_mask = expand_1.to(torch.float32)
+        expand_1 = None
+        tensor = torch.tensor(1.0, dtype=torch.float32)
+        inverted_mask = tensor - expanded_mask
+        tensor = expanded_mask = None
+        to_1 = inverted_mask.to(torch.bool)
+        extended_attention_mask = inverted_mask.masked_fill(
+            to_1, -3.4028234663852886e38
+        )
+        inverted_mask = to_1 = None
+        linear = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view = linear.view(1, -1, 16, 64)
+        linear = None
+        query_layer = view.transpose(1,
2) + view = None + linear_1 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_1.view(1, -1, 16, 64) + linear_1 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_2 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_2.view(1, -1, 16, 64) + linear_2 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer = key_layer = value_layer = None + attn_output_1 = attn_output.transpose(1, 2) + attn_output = None + attn_output_2 = attn_output_1.reshape(1, 20, 1024) + attn_output_1 = None + hidden_states = torch._C._nn.linear( + attn_output_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + add_3 = hidden_states_1 + embeddings_3 + hidden_states_1 = embeddings_3 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_3, + (1024,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + 
hidden_states_4, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.1, False, False + ) + hidden_states_5 = None + add_4 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_4, + (1024,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_3 = linear_6.view(1, -1, 16, 64) + linear_6 = None + query_layer_1 = view_3.transpose(1, 2) + view_3 = None + linear_7 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_4 = linear_7.view(1, -1, 16, 64) + linear_7 = None + key_layer_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_5 = linear_8.view(1, -1, 16, 64) + linear_8 = None + value_layer_1 = view_5.transpose(1, 2) + view_5 = None + attn_output_3 = torch._C._nn.scaled_dot_product_attention( + query_layer_1, + key_layer_1, + value_layer_1, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_1 = key_layer_1 = value_layer_1 = None + attn_output_4 = attn_output_3.transpose(1, 2) + attn_output_3 = None + attn_output_5 = attn_output_4.reshape(1, 20, 1024) + attn_output_4 = None + hidden_states_8 = torch._C._nn.linear( + attn_output_5, + 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_5 = hidden_states_9 + hidden_states_7 + hidden_states_9 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + add_5, + (1024,), + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.gelu(hidden_states_11) + hidden_states_11 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_6 = hidden_states_14 + hidden_states_10 + hidden_states_14 = hidden_states_10 = None + hidden_states_15 = torch.nn.functional.layer_norm( + add_6, + (1024,), + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_12 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + 
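+        # NOTE: every encoder layer repeats the block captured above for
+        # layers 0 and 1. A readable sketch of one layer (illustrative names,
+        # assuming F = torch.nn.functional; not part of the captured graph):
+        #
+        #     q = F.linear(x, w_q, b_q).view(1, -1, 16, 64).transpose(1, 2)
+        #     k = F.linear(x, w_k, b_k).view(1, -1, 16, 64).transpose(1, 2)
+        #     v = F.linear(x, w_v, b_v).view(1, -1, 16, 64).transpose(1, 2)
+        #     a = F.scaled_dot_product_attention(
+        #         q, k, v, attn_mask=extended_attention_mask, dropout_p=0.0
+        #     )
+        #     a = a.transpose(1, 2).reshape(1, 20, 1024)
+        #     x = F.layer_norm(
+        #         F.dropout(F.linear(a, w_o, b_o), 0.1) + x, (1024,), w_ln1, b_ln1
+        #     )
+        #     h = F.gelu(F.linear(x, w_i, b_i))
+        #     x = F.layer_norm(
+        #         F.dropout(F.linear(h, w_f, b_f), 0.1) + x, (1024,), w_ln2, b_ln2
+        #     )
+        #
+        # i.e. 16 attention heads of size 64 (hidden size 1024), one shared
+        # additive attention mask, and post-LayerNorm residual blocks with a
+        # GELU feed-forward network.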
view_6 = linear_12.view(1, -1, 16, 64) + linear_12 = None + query_layer_2 = view_6.transpose(1, 2) + view_6 = None + linear_13 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_7 = linear_13.view(1, -1, 16, 64) + linear_13 = None + key_layer_2 = view_7.transpose(1, 2) + view_7 = None + linear_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_8 = linear_14.view(1, -1, 16, 64) + linear_14 = None + value_layer_2 = view_8.transpose(1, 2) + view_8 = None + attn_output_6 = torch._C._nn.scaled_dot_product_attention( + query_layer_2, + key_layer_2, + value_layer_2, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_2 = key_layer_2 = value_layer_2 = None + attn_output_7 = attn_output_6.transpose(1, 2) + attn_output_6 = None + attn_output_8 = attn_output_7.reshape(1, 20, 1024) + attn_output_7 = None + hidden_states_16 = torch._C._nn.linear( + attn_output_8, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_17 = torch.nn.functional.dropout( + hidden_states_16, 0.1, False, False + ) + hidden_states_16 = None + add_7 = hidden_states_17 + hidden_states_15 + hidden_states_17 = hidden_states_15 = None + hidden_states_18 = torch.nn.functional.layer_norm( + add_7, + (1024,), + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_7 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_20 = torch._C._nn.gelu(hidden_states_19) + hidden_states_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + add_8 = hidden_states_22 + hidden_states_18 + hidden_states_22 = hidden_states_18 = None + hidden_states_23 = torch.nn.functional.layer_norm( + add_8, + (1024,), + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_18 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_9 = linear_18.view(1, -1, 16, 64) + linear_18 = None + query_layer_3 = view_9.transpose(1, 2) + view_9 = None + linear_19 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_10 = linear_19.view(1, -1, 16, 64) + linear_19 = None + key_layer_3 = view_10.transpose(1, 2) + view_10 = None + linear_20 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_11 = linear_20.view(1, -1, 16, 64) + linear_20 = None + value_layer_3 = view_11.transpose(1, 2) + view_11 = None + attn_output_9 = torch._C._nn.scaled_dot_product_attention( + query_layer_3, + key_layer_3, + value_layer_3, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_3 = key_layer_3 = value_layer_3 = None + attn_output_10 = 
attn_output_9.transpose(1, 2) + attn_output_9 = None + attn_output_11 = attn_output_10.reshape(1, 20, 1024) + attn_output_10 = None + hidden_states_24 = torch._C._nn.linear( + attn_output_11, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_25 = torch.nn.functional.dropout( + hidden_states_24, 0.1, False, False + ) + hidden_states_24 = None + add_9 = hidden_states_25 + hidden_states_23 + hidden_states_25 = hidden_states_23 = None + hidden_states_26 = torch.nn.functional.layer_norm( + add_9, + (1024,), + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_9 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_28 = torch._C._nn.gelu(hidden_states_27) + hidden_states_27 = None + hidden_states_29 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + add_10 = hidden_states_30 + hidden_states_26 + hidden_states_30 = hidden_states_26 = None + hidden_states_31 = torch.nn.functional.layer_norm( + add_10, + (1024,), + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_10 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_24 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_, + ) + 
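+        # NOTE: `extended_attention_mask`, built once before layer 0, is
+        # reused unchanged by every layer's scaled_dot_product_attention
+        # call: the (1, 20) padding mask is expanded to (1, 1, 20, 20),
+        # inverted, and masked positions are filled with the most negative
+        # float32 (-3.4028e38) so they contribute nothing after softmax.
+        # The earlier ne(1)/cumsum sequence matches the RoBERTa-style
+        # position-id scheme:
+        #
+        #     position_ids = cumsum(input_ids.ne(pad_id)) * mask + pad_id
+        #
+        # with pad_id = 1 hard-coded in the captured graph.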
l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_12 = linear_24.view(1, -1, 16, 64) + linear_24 = None + query_layer_4 = view_12.transpose(1, 2) + view_12 = None + linear_25 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_13 = linear_25.view(1, -1, 16, 64) + linear_25 = None + key_layer_4 = view_13.transpose(1, 2) + view_13 = None + linear_26 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_14 = linear_26.view(1, -1, 16, 64) + linear_26 = None + value_layer_4 = view_14.transpose(1, 2) + view_14 = None + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_layer_4, + key_layer_4, + value_layer_4, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_4 = key_layer_4 = value_layer_4 = None + attn_output_13 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_14 = attn_output_13.reshape(1, 20, 1024) + attn_output_13 = None + hidden_states_32 = torch._C._nn.linear( + attn_output_14, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_33 = torch.nn.functional.dropout( + hidden_states_32, 0.1, False, False + ) + hidden_states_32 = None + add_11 = hidden_states_33 + hidden_states_31 + hidden_states_33 = hidden_states_31 = None + hidden_states_34 = torch.nn.functional.layer_norm( + add_11, + (1024,), + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_11 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_35 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_36 = torch._C._nn.gelu(hidden_states_35) + hidden_states_35 = None + hidden_states_37 = torch._C._nn.linear( + hidden_states_36, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_38 = torch.nn.functional.dropout( + hidden_states_37, 0.1, False, False + ) + hidden_states_37 = None + add_12 = hidden_states_38 + hidden_states_34 + hidden_states_38 = hidden_states_34 = None + hidden_states_39 = torch.nn.functional.layer_norm( + add_12, + (1024,), + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_12 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_30 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_15 = linear_30.view(1, -1, 16, 64) + linear_30 = None + query_layer_5 = view_15.transpose(1, 2) + view_15 = None + linear_31 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_16 = linear_31.view(1, -1, 16, 64) + linear_31 = None + key_layer_5 = view_16.transpose(1, 2) + view_16 = None + linear_32 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_17 = linear_32.view(1, -1, 16, 64) + linear_32 = None + value_layer_5 = view_17.transpose(1, 2) + view_17 = None + attn_output_15 = 
torch._C._nn.scaled_dot_product_attention( + query_layer_5, + key_layer_5, + value_layer_5, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_5 = key_layer_5 = value_layer_5 = None + attn_output_16 = attn_output_15.transpose(1, 2) + attn_output_15 = None + attn_output_17 = attn_output_16.reshape(1, 20, 1024) + attn_output_16 = None + hidden_states_40 = torch._C._nn.linear( + attn_output_17, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_41 = torch.nn.functional.dropout( + hidden_states_40, 0.1, False, False + ) + hidden_states_40 = None + add_13 = hidden_states_41 + hidden_states_39 + hidden_states_41 = hidden_states_39 = None + hidden_states_42 = torch.nn.functional.layer_norm( + add_13, + (1024,), + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_13 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_43 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_44 = torch._C._nn.gelu(hidden_states_43) + hidden_states_43 = None + hidden_states_45 = torch._C._nn.linear( + hidden_states_44, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_46 = torch.nn.functional.dropout( + hidden_states_45, 0.1, False, False + ) + hidden_states_45 = None + add_14 = hidden_states_46 + hidden_states_42 + hidden_states_46 = hidden_states_42 = None + hidden_states_47 = torch.nn.functional.layer_norm( + add_14, + (1024,), + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_14 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_36 = torch._C._nn.linear( + hidden_states_47, + 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_18 = linear_36.view(1, -1, 16, 64) + linear_36 = None + query_layer_6 = view_18.transpose(1, 2) + view_18 = None + linear_37 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_19 = linear_37.view(1, -1, 16, 64) + linear_37 = None + key_layer_6 = view_19.transpose(1, 2) + view_19 = None + linear_38 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_20 = linear_38.view(1, -1, 16, 64) + linear_38 = None + value_layer_6 = view_20.transpose(1, 2) + view_20 = None + attn_output_18 = torch._C._nn.scaled_dot_product_attention( + query_layer_6, + key_layer_6, + value_layer_6, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_6 = key_layer_6 = value_layer_6 = None + attn_output_19 = attn_output_18.transpose(1, 2) + attn_output_18 = None + attn_output_20 = attn_output_19.reshape(1, 20, 1024) + attn_output_19 = None + hidden_states_48 = torch._C._nn.linear( + attn_output_20, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_49 = torch.nn.functional.dropout( + hidden_states_48, 0.1, False, False + ) + hidden_states_48 = None + add_15 = hidden_states_49 + hidden_states_47 + hidden_states_49 = hidden_states_47 = None + hidden_states_50 = torch.nn.functional.layer_norm( + add_15, + (1024,), + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_15 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_51 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_52 = torch._C._nn.gelu(hidden_states_51) + hidden_states_51 = None + hidden_states_53 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_54 = torch.nn.functional.dropout( + hidden_states_53, 0.1, False, False + ) + hidden_states_53 = None + add_16 = hidden_states_54 + hidden_states_50 + hidden_states_54 = hidden_states_50 = None + hidden_states_55 = torch.nn.functional.layer_norm( + add_16, + (1024,), + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_16 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_42 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_21 = linear_42.view(1, -1, 16, 64) + linear_42 = None + query_layer_7 = view_21.transpose(1, 2) + view_21 = None + linear_43 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_22 = linear_43.view(1, -1, 16, 64) + linear_43 = None + key_layer_7 = view_22.transpose(1, 2) + view_22 = None + linear_44 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_, + ) + 
l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_23 = linear_44.view(1, -1, 16, 64) + linear_44 = None + value_layer_7 = view_23.transpose(1, 2) + view_23 = None + attn_output_21 = torch._C._nn.scaled_dot_product_attention( + query_layer_7, + key_layer_7, + value_layer_7, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_7 = key_layer_7 = value_layer_7 = None + attn_output_22 = attn_output_21.transpose(1, 2) + attn_output_21 = None + attn_output_23 = attn_output_22.reshape(1, 20, 1024) + attn_output_22 = None + hidden_states_56 = torch._C._nn.linear( + attn_output_23, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_57 = torch.nn.functional.dropout( + hidden_states_56, 0.1, False, False + ) + hidden_states_56 = None + add_17 = hidden_states_57 + hidden_states_55 + hidden_states_57 = hidden_states_55 = None + hidden_states_58 = torch.nn.functional.layer_norm( + add_17, + (1024,), + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_17 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_59 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_60 = torch._C._nn.gelu(hidden_states_59) + hidden_states_59 = None + hidden_states_61 = torch._C._nn.linear( + hidden_states_60, + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_62 = torch.nn.functional.dropout( + hidden_states_61, 0.1, False, False + ) + hidden_states_61 = None + add_18 = hidden_states_62 + hidden_states_58 + hidden_states_62 = hidden_states_58 = None + hidden_states_63 = torch.nn.functional.layer_norm( + add_18, + (1024,), + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_18 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_48 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_24 = linear_48.view(1, -1, 16, 64) + linear_48 = None + query_layer_8 = view_24.transpose(1, 2) + view_24 = None + linear_49 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_25 = linear_49.view(1, -1, 16, 64) + linear_49 = None + key_layer_8 = view_25.transpose(1, 2) + view_25 = None + linear_50 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_26 = linear_50.view(1, -1, 16, 64) + linear_50 = None + value_layer_8 = view_26.transpose(1, 2) + view_26 = None + attn_output_24 = torch._C._nn.scaled_dot_product_attention( + query_layer_8, + key_layer_8, + value_layer_8, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_8 = key_layer_8 = value_layer_8 = None + attn_output_25 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_26 = attn_output_25.reshape(1, 20, 1024) + attn_output_25 = None + hidden_states_64 = torch._C._nn.linear( + attn_output_26, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_65 = torch.nn.functional.dropout( + hidden_states_64, 0.1, False, False + ) + hidden_states_64 = None + add_19 = hidden_states_65 + hidden_states_63 + hidden_states_65 = hidden_states_63 = None + hidden_states_66 = torch.nn.functional.layer_norm( + add_19, + (1024,), + 
l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_19 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_67 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_68 = torch._C._nn.gelu(hidden_states_67) + hidden_states_67 = None + hidden_states_69 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_70 = torch.nn.functional.dropout( + hidden_states_69, 0.1, False, False + ) + hidden_states_69 = None + add_20 = hidden_states_70 + hidden_states_66 + hidden_states_70 = hidden_states_66 = None + hidden_states_71 = torch.nn.functional.layer_norm( + add_20, + (1024,), + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_20 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_54 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_27 = linear_54.view(1, -1, 16, 64) + linear_54 = None + query_layer_9 = view_27.transpose(1, 2) + view_27 = None + linear_55 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_28 = linear_55.view(1, -1, 16, 64) + linear_55 = None + key_layer_9 = view_28.transpose(1, 2) + view_28 = None + linear_56 
= torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_29 = linear_56.view(1, -1, 16, 64) + linear_56 = None + value_layer_9 = view_29.transpose(1, 2) + view_29 = None + attn_output_27 = torch._C._nn.scaled_dot_product_attention( + query_layer_9, + key_layer_9, + value_layer_9, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_9 = key_layer_9 = value_layer_9 = None + attn_output_28 = attn_output_27.transpose(1, 2) + attn_output_27 = None + attn_output_29 = attn_output_28.reshape(1, 20, 1024) + attn_output_28 = None + hidden_states_72 = torch._C._nn.linear( + attn_output_29, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_73 = torch.nn.functional.dropout( + hidden_states_72, 0.1, False, False + ) + hidden_states_72 = None + add_21 = hidden_states_73 + hidden_states_71 + hidden_states_73 = hidden_states_71 = None + hidden_states_74 = torch.nn.functional.layer_norm( + add_21, + (1024,), + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_21 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_75 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_76 = torch._C._nn.gelu(hidden_states_75) + hidden_states_75 = None + hidden_states_77 = torch._C._nn.linear( + hidden_states_76, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_78 = torch.nn.functional.dropout( + hidden_states_77, 0.1, False, False + ) + hidden_states_77 = None + add_22 = hidden_states_78 
+ hidden_states_74 + hidden_states_78 = hidden_states_74 = None + hidden_states_79 = torch.nn.functional.layer_norm( + add_22, + (1024,), + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_22 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_60 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_30 = linear_60.view(1, -1, 16, 64) + linear_60 = None + query_layer_10 = view_30.transpose(1, 2) + view_30 = None + linear_61 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_31 = linear_61.view(1, -1, 16, 64) + linear_61 = None + key_layer_10 = view_31.transpose(1, 2) + view_31 = None + linear_62 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_32 = linear_62.view(1, -1, 16, 64) + linear_62 = None + value_layer_10 = view_32.transpose(1, 2) + view_32 = None + attn_output_30 = torch._C._nn.scaled_dot_product_attention( + query_layer_10, + key_layer_10, + value_layer_10, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_10 = key_layer_10 = value_layer_10 = None + attn_output_31 = attn_output_30.transpose(1, 2) + attn_output_30 = None + attn_output_32 = attn_output_31.reshape(1, 20, 1024) + attn_output_31 = None + hidden_states_80 = torch._C._nn.linear( + attn_output_32, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_81 = torch.nn.functional.dropout( + hidden_states_80, 0.1, False, False + ) + 
hidden_states_80 = None + add_23 = hidden_states_81 + hidden_states_79 + hidden_states_81 = hidden_states_79 = None + hidden_states_82 = torch.nn.functional.layer_norm( + add_23, + (1024,), + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_23 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_83 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_84 = torch._C._nn.gelu(hidden_states_83) + hidden_states_83 = None + hidden_states_85 = torch._C._nn.linear( + hidden_states_84, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_86 = torch.nn.functional.dropout( + hidden_states_85, 0.1, False, False + ) + hidden_states_85 = None + add_24 = hidden_states_86 + hidden_states_82 + hidden_states_86 = hidden_states_82 = None + hidden_states_87 = torch.nn.functional.layer_norm( + add_24, + (1024,), + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_24 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_66 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_33 = linear_66.view(1, -1, 16, 64) + linear_66 = None + query_layer_11 = view_33.transpose(1, 2) + view_33 = None + linear_67 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_34 = linear_67.view(1, -1, 16, 64) + linear_67 = None + key_layer_11 = view_34.transpose(1, 2) + view_34 = None + linear_68 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_35 = linear_68.view(1, -1, 16, 64) + linear_68 = None + value_layer_11 = view_35.transpose(1, 2) + view_35 = None + attn_output_33 = torch._C._nn.scaled_dot_product_attention( + query_layer_11, + key_layer_11, + value_layer_11, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_11 = key_layer_11 = value_layer_11 = None + attn_output_34 = attn_output_33.transpose(1, 2) + attn_output_33 = None + attn_output_35 = attn_output_34.reshape(1, 20, 1024) + attn_output_34 = None + hidden_states_88 = torch._C._nn.linear( + attn_output_35, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_89 = torch.nn.functional.dropout( + hidden_states_88, 0.1, False, False + ) + hidden_states_88 = None + add_25 = hidden_states_89 + hidden_states_87 + hidden_states_89 = hidden_states_87 = None + hidden_states_90 = torch.nn.functional.layer_norm( + add_25, + (1024,), + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_25 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_91 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_92 = torch._C._nn.gelu(hidden_states_91) + hidden_states_91 = None + hidden_states_93 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_92 = 
l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_94 = torch.nn.functional.dropout( + hidden_states_93, 0.1, False, False + ) + hidden_states_93 = None + add_26 = hidden_states_94 + hidden_states_90 + hidden_states_94 = hidden_states_90 = None + hidden_states_95 = torch.nn.functional.layer_norm( + add_26, + (1024,), + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_26 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_72 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_36 = linear_72.view(1, -1, 16, 64) + linear_72 = None + query_layer_12 = view_36.transpose(1, 2) + view_36 = None + linear_73 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_37 = linear_73.view(1, -1, 16, 64) + linear_73 = None + key_layer_12 = view_37.transpose(1, 2) + view_37 = None + linear_74 = torch._C._nn.linear( + hidden_states_95, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_38 = linear_74.view(1, -1, 16, 64) + linear_74 = None + value_layer_12 = view_38.transpose(1, 2) + view_38 = None + attn_output_36 = torch._C._nn.scaled_dot_product_attention( + query_layer_12, + key_layer_12, + value_layer_12, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_12 = key_layer_12 = value_layer_12 = None + attn_output_37 = attn_output_36.transpose(1, 2) + attn_output_36 = None + attn_output_38 = attn_output_37.reshape(1, 20, 1024) + attn_output_37 = None + hidden_states_96 = torch._C._nn.linear( + attn_output_38, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + 
attn_output_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_97 = torch.nn.functional.dropout( + hidden_states_96, 0.1, False, False + ) + hidden_states_96 = None + add_27 = hidden_states_97 + hidden_states_95 + hidden_states_97 = hidden_states_95 = None + hidden_states_98 = torch.nn.functional.layer_norm( + add_27, + (1024,), + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_27 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_99 = torch._C._nn.linear( + hidden_states_98, + l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_100 = torch._C._nn.gelu(hidden_states_99) + hidden_states_99 = None + hidden_states_101 = torch._C._nn.linear( + hidden_states_100, + l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_100 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_102 = torch.nn.functional.dropout( + hidden_states_101, 0.1, False, False + ) + hidden_states_101 = None + add_28 = hidden_states_102 + hidden_states_98 + hidden_states_102 = hidden_states_98 = None + hidden_states_103 = torch.nn.functional.layer_norm( + add_28, + (1024,), + l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_28 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_78 = torch._C._nn.linear( + hidden_states_103, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_39 = linear_78.view(1, -1, 16, 64) + linear_78 = None + query_layer_13 = view_39.transpose(1, 2) + view_39 = None + linear_79 = torch._C._nn.linear( + hidden_states_103, + 
l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_40 = linear_79.view(1, -1, 16, 64) + linear_79 = None + key_layer_13 = view_40.transpose(1, 2) + view_40 = None + linear_80 = torch._C._nn.linear( + hidden_states_103, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_41 = linear_80.view(1, -1, 16, 64) + linear_80 = None + value_layer_13 = view_41.transpose(1, 2) + view_41 = None + attn_output_39 = torch._C._nn.scaled_dot_product_attention( + query_layer_13, + key_layer_13, + value_layer_13, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_13 = key_layer_13 = value_layer_13 = None + attn_output_40 = attn_output_39.transpose(1, 2) + attn_output_39 = None + attn_output_41 = attn_output_40.reshape(1, 20, 1024) + attn_output_40 = None + hidden_states_104 = torch._C._nn.linear( + attn_output_41, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_105 = torch.nn.functional.dropout( + hidden_states_104, 0.1, False, False + ) + hidden_states_104 = None + add_29 = hidden_states_105 + hidden_states_103 + hidden_states_105 = hidden_states_103 = None + hidden_states_106 = torch.nn.functional.layer_norm( + add_29, + (1024,), + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_29 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_107 = torch._C._nn.linear( + hidden_states_106, + l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_108 = torch._C._nn.gelu(hidden_states_107) + hidden_states_107 = None 
+ hidden_states_109 = torch._C._nn.linear( + hidden_states_108, + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_108 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_110 = torch.nn.functional.dropout( + hidden_states_109, 0.1, False, False + ) + hidden_states_109 = None + add_30 = hidden_states_110 + hidden_states_106 + hidden_states_110 = hidden_states_106 = None + hidden_states_111 = torch.nn.functional.layer_norm( + add_30, + (1024,), + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_30 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_84 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_42 = linear_84.view(1, -1, 16, 64) + linear_84 = None + query_layer_14 = view_42.transpose(1, 2) + view_42 = None + linear_85 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_43 = linear_85.view(1, -1, 16, 64) + linear_85 = None + key_layer_14 = view_43.transpose(1, 2) + view_43 = None + linear_86 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_44 = linear_86.view(1, -1, 16, 64) + linear_86 = None + value_layer_14 = view_44.transpose(1, 2) + view_44 = None + attn_output_42 = torch._C._nn.scaled_dot_product_attention( + query_layer_14, + key_layer_14, + value_layer_14, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_14 = key_layer_14 = value_layer_14 = None + attn_output_43 = attn_output_42.transpose(1, 2) + attn_output_42 = None + attn_output_44 = attn_output_43.reshape(1, 20, 1024) + attn_output_43 = None + 
hidden_states_112 = torch._C._nn.linear( + attn_output_44, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_113 = torch.nn.functional.dropout( + hidden_states_112, 0.1, False, False + ) + hidden_states_112 = None + add_31 = hidden_states_113 + hidden_states_111 + hidden_states_113 = hidden_states_111 = None + hidden_states_114 = torch.nn.functional.layer_norm( + add_31, + (1024,), + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_31 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_115 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_116 = torch._C._nn.gelu(hidden_states_115) + hidden_states_115 = None + hidden_states_117 = torch._C._nn.linear( + hidden_states_116, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_116 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_118 = torch.nn.functional.dropout( + hidden_states_117, 0.1, False, False + ) + hidden_states_117 = None + add_32 = hidden_states_118 + hidden_states_114 + hidden_states_118 = hidden_states_114 = None + hidden_states_119 = torch.nn.functional.layer_norm( + add_32, + (1024,), + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_32 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_90 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ 
= l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_45 = linear_90.view(1, -1, 16, 64) + linear_90 = None + query_layer_15 = view_45.transpose(1, 2) + view_45 = None + linear_91 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_46 = linear_91.view(1, -1, 16, 64) + linear_91 = None + key_layer_15 = view_46.transpose(1, 2) + view_46 = None + linear_92 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_47 = linear_92.view(1, -1, 16, 64) + linear_92 = None + value_layer_15 = view_47.transpose(1, 2) + view_47 = None + attn_output_45 = torch._C._nn.scaled_dot_product_attention( + query_layer_15, + key_layer_15, + value_layer_15, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_15 = key_layer_15 = value_layer_15 = None + attn_output_46 = attn_output_45.transpose(1, 2) + attn_output_45 = None + attn_output_47 = attn_output_46.reshape(1, 20, 1024) + attn_output_46 = None + hidden_states_120 = torch._C._nn.linear( + attn_output_47, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_121 = torch.nn.functional.dropout( + hidden_states_120, 0.1, False, False + ) + hidden_states_120 = None + add_33 = hidden_states_121 + hidden_states_119 + hidden_states_121 = hidden_states_119 = None + hidden_states_122 = torch.nn.functional.layer_norm( + add_33, + (1024,), + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_33 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_123 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_, + ) 
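The capture above repeats the same multi-head split for every encoder layer: each Q/K/V projection is a plain linear, followed by a view to (1, -1, 16, 64) and a transpose(1, 2), i.e. a 1024-wide hidden state split into 16 heads of 64 dimensions with a hard-coded batch of 1. A minimal sketch of that reshape, assuming a generic (batch, seq, hidden) input (the helper name is illustrative, not part of this sample):

import torch

def split_heads(x: torch.Tensor, num_heads: int = 16, head_dim: int = 64) -> torch.Tensor:
    # (batch, seq, hidden) -> (batch, num_heads, seq, head_dim)
    batch, seq, _ = x.shape
    return x.view(batch, seq, num_heads, head_dim).transpose(1, 2)

The inverse appears right after each attention call: transpose(1, 2) followed by reshape(1, 20, 1024) merges the heads back for the output projection.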
+ l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_124 = torch._C._nn.gelu(hidden_states_123) + hidden_states_123 = None + hidden_states_125 = torch._C._nn.linear( + hidden_states_124, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_124 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_126 = torch.nn.functional.dropout( + hidden_states_125, 0.1, False, False + ) + hidden_states_125 = None + add_34 = hidden_states_126 + hidden_states_122 + hidden_states_126 = hidden_states_122 = None + hidden_states_127 = torch.nn.functional.layer_norm( + add_34, + (1024,), + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_34 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_96 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_48 = linear_96.view(1, -1, 16, 64) + linear_96 = None + query_layer_16 = view_48.transpose(1, 2) + view_48 = None + linear_97 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_49 = linear_97.view(1, -1, 16, 64) + linear_97 = None + key_layer_16 = view_49.transpose(1, 2) + view_49 = None + linear_98 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_50 = linear_98.view(1, -1, 16, 64) + linear_98 = None + value_layer_16 = view_50.transpose(1, 2) + view_50 = None + attn_output_48 = torch._C._nn.scaled_dot_product_attention( + query_layer_16, + key_layer_16, + 
value_layer_16, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_16 = key_layer_16 = value_layer_16 = None + attn_output_49 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_50 = attn_output_49.reshape(1, 20, 1024) + attn_output_49 = None + hidden_states_128 = torch._C._nn.linear( + attn_output_50, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_129 = torch.nn.functional.dropout( + hidden_states_128, 0.1, False, False + ) + hidden_states_128 = None + add_35 = hidden_states_129 + hidden_states_127 + hidden_states_129 = hidden_states_127 = None + hidden_states_130 = torch.nn.functional.layer_norm( + add_35, + (1024,), + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_35 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_131 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_132 = torch._C._nn.gelu(hidden_states_131) + hidden_states_131 = None + hidden_states_133 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_132 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_134 = torch.nn.functional.dropout( + hidden_states_133, 0.1, False, False + ) + hidden_states_133 = None + add_36 = hidden_states_134 + hidden_states_130 + hidden_states_134 = hidden_states_130 = None + hidden_states_135 = torch.nn.functional.layer_norm( + add_36, + (1024,), + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_36 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_102 = torch._C._nn.linear( + hidden_states_135, + 
l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_51 = linear_102.view(1, -1, 16, 64) + linear_102 = None + query_layer_17 = view_51.transpose(1, 2) + view_51 = None + linear_103 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_52 = linear_103.view(1, -1, 16, 64) + linear_103 = None + key_layer_17 = view_52.transpose(1, 2) + view_52 = None + linear_104 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_53 = linear_104.view(1, -1, 16, 64) + linear_104 = None + value_layer_17 = view_53.transpose(1, 2) + view_53 = None + attn_output_51 = torch._C._nn.scaled_dot_product_attention( + query_layer_17, + key_layer_17, + value_layer_17, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_17 = key_layer_17 = value_layer_17 = None + attn_output_52 = attn_output_51.transpose(1, 2) + attn_output_51 = None + attn_output_53 = attn_output_52.reshape(1, 20, 1024) + attn_output_52 = None + hidden_states_136 = torch._C._nn.linear( + attn_output_53, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_137 = torch.nn.functional.dropout( + hidden_states_136, 0.1, False, False + ) + hidden_states_136 = None + add_37 = hidden_states_137 + hidden_states_135 + hidden_states_137 = hidden_states_135 = None + hidden_states_138 = torch.nn.functional.layer_norm( + add_37, + (1024,), + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_37 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_139 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_140 = torch._C._nn.gelu(hidden_states_139) + hidden_states_139 = None + hidden_states_141 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_140 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_142 = torch.nn.functional.dropout( + hidden_states_141, 0.1, False, False + ) + hidden_states_141 = None + add_38 = hidden_states_142 + hidden_states_138 + hidden_states_142 = hidden_states_138 = None + hidden_states_143 = torch.nn.functional.layer_norm( + add_38, + (1024,), + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_38 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_108 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_54 = linear_108.view(1, -1, 16, 64) + linear_108 = None + query_layer_18 = view_54.transpose(1, 2) + view_54 = None + linear_109 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_55 = linear_109.view(1, -1, 16, 64) + linear_109 = None + key_layer_18 = view_55.transpose(1, 2) + view_55 = None + linear_110 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_, + ) + 
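Every attention call in the trace goes through torch._C._nn.scaled_dot_product_attention with dropout_p=0.0 (an inference-mode capture) and is_causal=False (a bidirectional encoder); extended_attention_mask is an additive mask, roughly 0.0 at visible positions and a large negative value at padded ones. A reference sketch of what that call computes, assuming float additive masking (the textbook formula, not the fused kernel PyTorch actually dispatches to):

import math
import torch

def sdpa_reference(q, k, v, attn_mask):
    # softmax(q @ k^T / sqrt(d) + mask) @ v
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
    return torch.softmax(scores + attn_mask, dim=-1) @ v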
l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_56 = linear_110.view(1, -1, 16, 64) + linear_110 = None + value_layer_18 = view_56.transpose(1, 2) + view_56 = None + attn_output_54 = torch._C._nn.scaled_dot_product_attention( + query_layer_18, + key_layer_18, + value_layer_18, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_18 = key_layer_18 = value_layer_18 = None + attn_output_55 = attn_output_54.transpose(1, 2) + attn_output_54 = None + attn_output_56 = attn_output_55.reshape(1, 20, 1024) + attn_output_55 = None + hidden_states_144 = torch._C._nn.linear( + attn_output_56, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_56 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_145 = torch.nn.functional.dropout( + hidden_states_144, 0.1, False, False + ) + hidden_states_144 = None + add_39 = hidden_states_145 + hidden_states_143 + hidden_states_145 = hidden_states_143 = None + hidden_states_146 = torch.nn.functional.layer_norm( + add_39, + (1024,), + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_39 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_147 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_148 = torch._C._nn.gelu(hidden_states_147) + hidden_states_147 = None + hidden_states_149 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_148 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_150 = torch.nn.functional.dropout( + hidden_states_149, 0.1, False, False + ) + hidden_states_149 = None + add_40 = hidden_states_150 + hidden_states_146 + hidden_states_150 = hidden_states_146 = None + hidden_states_151 = torch.nn.functional.layer_norm( + add_40, + (1024,), + 
l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_40 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_114 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_57 = linear_114.view(1, -1, 16, 64) + linear_114 = None + query_layer_19 = view_57.transpose(1, 2) + view_57 = None + linear_115 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_58 = linear_115.view(1, -1, 16, 64) + linear_115 = None + key_layer_19 = view_58.transpose(1, 2) + view_58 = None + linear_116 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_59 = linear_116.view(1, -1, 16, 64) + linear_116 = None + value_layer_19 = view_59.transpose(1, 2) + view_59 = None + attn_output_57 = torch._C._nn.scaled_dot_product_attention( + query_layer_19, + key_layer_19, + value_layer_19, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_19 = key_layer_19 = value_layer_19 = None + attn_output_58 = attn_output_57.transpose(1, 2) + attn_output_57 = None + attn_output_59 = attn_output_58.reshape(1, 20, 1024) + attn_output_58 = None + hidden_states_152 = torch._C._nn.linear( + attn_output_59, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_153 = torch.nn.functional.dropout( + hidden_states_152, 0.1, False, False + ) + hidden_states_152 = None + add_41 = hidden_states_153 + hidden_states_151 + hidden_states_153 = hidden_states_151 = None + 
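Each sub-block then closes with the same post-LayerNorm residual: dense, dropout(p=0.1, training=False, so a no-op in this capture), residual add, and layer_norm over the last 1024 dimensions with eps=1e-05. The chained `x = None` assignments in between are how the captured graph drops references to intermediates and inlined parameters as soon as they are dead. In module form the residual pattern is roughly (an illustrative sketch, not code from this sample):

import torch
from torch import nn

class ResidualOutput(nn.Module):
    def __init__(self, hidden: int = 1024, p: float = 0.1):
        super().__init__()
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(p)
        self.norm = nn.LayerNorm(hidden, eps=1e-5)

    def forward(self, x: torch.Tensor, residual: torch.Tensor) -> torch.Tensor:
        # dense -> dropout -> residual add -> post-LayerNorm
        return self.norm(self.dropout(self.dense(x)) + residual)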
hidden_states_154 = torch.nn.functional.layer_norm( + add_41, + (1024,), + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_41 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_155 = torch._C._nn.linear( + hidden_states_154, + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_156 = torch._C._nn.gelu(hidden_states_155) + hidden_states_155 = None + hidden_states_157 = torch._C._nn.linear( + hidden_states_156, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_156 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_158 = torch.nn.functional.dropout( + hidden_states_157, 0.1, False, False + ) + hidden_states_157 = None + add_42 = hidden_states_158 + hidden_states_154 + hidden_states_158 = hidden_states_154 = None + hidden_states_159 = torch.nn.functional.layer_norm( + add_42, + (1024,), + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_42 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_120 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_60 = linear_120.view(1, -1, 16, 64) + linear_120 = None + query_layer_20 = view_60.transpose(1, 2) + view_60 = None + linear_121 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + 
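Between the two residual blocks of every layer sits the feed-forward pair: an intermediate dense expanding 1024 to 4096, a GELU (traced as torch._C._nn.gelu, i.e. the exact, non-approximate variant), and an output dense projecting back to 1024. As a sketch (assuming nn.GELU's default approximate="none", which matches the traced call):

from torch import nn

ffn = nn.Sequential(
    nn.Linear(1024, 4096),  # intermediate.dense
    nn.GELU(),
    nn.Linear(4096, 1024),  # output.dense
)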
view_61 = linear_121.view(1, -1, 16, 64) + linear_121 = None + key_layer_20 = view_61.transpose(1, 2) + view_61 = None + linear_122 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_62 = linear_122.view(1, -1, 16, 64) + linear_122 = None + value_layer_20 = view_62.transpose(1, 2) + view_62 = None + attn_output_60 = torch._C._nn.scaled_dot_product_attention( + query_layer_20, + key_layer_20, + value_layer_20, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_20 = key_layer_20 = value_layer_20 = None + attn_output_61 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_62 = attn_output_61.reshape(1, 20, 1024) + attn_output_61 = None + hidden_states_160 = torch._C._nn.linear( + attn_output_62, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_161 = torch.nn.functional.dropout( + hidden_states_160, 0.1, False, False + ) + hidden_states_160 = None + add_43 = hidden_states_161 + hidden_states_159 + hidden_states_161 = hidden_states_159 = None + hidden_states_162 = torch.nn.functional.layer_norm( + add_43, + (1024,), + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_43 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_163 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_164 = torch._C._nn.gelu(hidden_states_163) + hidden_states_163 = None + hidden_states_165 = torch._C._nn.linear( + hidden_states_164, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_164 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_166 = torch.nn.functional.dropout( + hidden_states_165, 0.1, False, False + ) + hidden_states_165 = None + add_44 = hidden_states_166 + hidden_states_162 + hidden_states_166 = hidden_states_162 = None + hidden_states_167 = torch.nn.functional.layer_norm( + add_44, + (1024,), + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_44 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_126 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_63 = linear_126.view(1, -1, 16, 64) + linear_126 = None + query_layer_21 = view_63.transpose(1, 2) + view_63 = None + linear_127 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_64 = linear_127.view(1, -1, 16, 64) + linear_127 = None + key_layer_21 = view_64.transpose(1, 2) + view_64 = None + linear_128 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_65 = linear_128.view(1, -1, 16, 64) + linear_128 = None + value_layer_21 = view_65.transpose(1, 2) + view_65 = None + attn_output_63 = torch._C._nn.scaled_dot_product_attention( + query_layer_21, + key_layer_21, + value_layer_21, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_21 = key_layer_21 = value_layer_21 = None + attn_output_64 = attn_output_63.transpose(1, 2) + attn_output_63 = None + attn_output_65 = attn_output_64.reshape(1, 20, 1024) + attn_output_64 = None + hidden_states_168 = torch._C._nn.linear( + attn_output_65, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_65 = 
l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_169 = torch.nn.functional.dropout( + hidden_states_168, 0.1, False, False + ) + hidden_states_168 = None + add_45 = hidden_states_169 + hidden_states_167 + hidden_states_169 = hidden_states_167 = None + hidden_states_170 = torch.nn.functional.layer_norm( + add_45, + (1024,), + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_45 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_171 = torch._C._nn.linear( + hidden_states_170, + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_172 = torch._C._nn.gelu(hidden_states_171) + hidden_states_171 = None + hidden_states_173 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_172 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_174 = torch.nn.functional.dropout( + hidden_states_173, 0.1, False, False + ) + hidden_states_173 = None + add_46 = hidden_states_174 + hidden_states_170 + hidden_states_174 = hidden_states_170 = None + hidden_states_175 = torch.nn.functional.layer_norm( + add_46, + (1024,), + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_46 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_132 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_66 = linear_132.view(1, -1, 16, 64) + linear_132 = None + query_layer_22 = view_66.transpose(1, 2) + view_66 = None + linear_133 = torch._C._nn.linear( + hidden_states_175, + 
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_67 = linear_133.view(1, -1, 16, 64) + linear_133 = None + key_layer_22 = view_67.transpose(1, 2) + view_67 = None + linear_134 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_68 = linear_134.view(1, -1, 16, 64) + linear_134 = None + value_layer_22 = view_68.transpose(1, 2) + view_68 = None + attn_output_66 = torch._C._nn.scaled_dot_product_attention( + query_layer_22, + key_layer_22, + value_layer_22, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_22 = key_layer_22 = value_layer_22 = None + attn_output_67 = attn_output_66.transpose(1, 2) + attn_output_66 = None + attn_output_68 = attn_output_67.reshape(1, 20, 1024) + attn_output_67 = None + hidden_states_176 = torch._C._nn.linear( + attn_output_68, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_177 = torch.nn.functional.dropout( + hidden_states_176, 0.1, False, False + ) + hidden_states_176 = None + add_47 = hidden_states_177 + hidden_states_175 + hidden_states_177 = hidden_states_175 = None + hidden_states_178 = torch.nn.functional.layer_norm( + add_47, + (1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_47 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_179 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_180 = torch._C._nn.gelu(hidden_states_179) + hidden_states_179 = 
None + hidden_states_181 = torch._C._nn.linear( + hidden_states_180, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_180 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_182 = torch.nn.functional.dropout( + hidden_states_181, 0.1, False, False + ) + hidden_states_181 = None + add_48 = hidden_states_182 + hidden_states_178 + hidden_states_182 = hidden_states_178 = None + hidden_states_183 = torch.nn.functional.layer_norm( + add_48, + (1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_48 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_138 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_69 = linear_138.view(1, -1, 16, 64) + linear_138 = None + query_layer_23 = view_69.transpose(1, 2) + view_69 = None + linear_139 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_70 = linear_139.view(1, -1, 16, 64) + linear_139 = None + key_layer_23 = view_70.transpose(1, 2) + view_70 = None + linear_140 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_71 = linear_140.view(1, -1, 16, 64) + linear_140 = None + value_layer_23 = view_71.transpose(1, 2) + view_71 = None + attn_output_69 = torch._C._nn.scaled_dot_product_attention( + query_layer_23, + key_layer_23, + value_layer_23, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_23 = key_layer_23 = value_layer_23 = extended_attention_mask = None + attn_output_70 = attn_output_69.transpose(1, 2) + attn_output_69 = None + attn_output_71 = attn_output_70.reshape(1, 20, 
1024) + attn_output_70 = None + hidden_states_184 = torch._C._nn.linear( + attn_output_71, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_185 = torch.nn.functional.dropout( + hidden_states_184, 0.1, False, False + ) + hidden_states_184 = None + add_49 = hidden_states_185 + hidden_states_183 + hidden_states_185 = hidden_states_183 = None + hidden_states_186 = torch.nn.functional.layer_norm( + add_49, + (1024,), + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_49 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_187 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_188 = torch._C._nn.gelu(hidden_states_187) + hidden_states_187 = None + hidden_states_189 = torch._C._nn.linear( + hidden_states_188, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_188 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_190 = torch.nn.functional.dropout( + hidden_states_189, 0.1, False, False + ) + hidden_states_189 = None + add_50 = hidden_states_190 + hidden_states_186 + hidden_states_190 = hidden_states_186 = None + hidden_states_191 = torch.nn.functional.layer_norm( + add_50, + (1024,), + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_50 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = (None) + first_token_tensor = hidden_states_191[(slice(None, None, None), 0)] + pooled_output = torch._C._nn.linear( + first_token_tensor, + l_self_modules_pooler_modules_dense_parameters_weight_, + l_self_modules_pooler_modules_dense_parameters_bias_, + ) + first_token_tensor = ( + l_self_modules_pooler_modules_dense_parameters_weight_ + ) = 
l_self_modules_pooler_modules_dense_parameters_bias_ = None + pooled_output_1 = torch.tanh(pooled_output) + pooled_output = None + return (hidden_states_191, pooled_output_1) diff --git a/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/weight_meta.py b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/weight_meta.py new file mode 100644 index 000000000..cbd8ed2db --- /dev/null +++ b/samples/transformers-auto-model/Data-Lab_ruRoberta-large_classification_v0.2/weight_meta.py @@ -0,0 +1,3960 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 20] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [ + 1, + 43, + 36120, + 50, + 3150, + 16710, + 33958, + 3644, + 5913, + 19493, + 10959, + 18445, + 47235, + 5627, + 19429, + 6340, + 36120, + 87, + 18, + 2, + ] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_token_type_ids_: + name = "L_self_modules_embeddings_buffers_token_type_ids_" + shape = [1, 514] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 0 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [50265, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [1, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.001 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [514, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 20] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype 
= "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 
1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: 
+ name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_" + shape = 
[1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_" + shape = 
[1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_hash.txt b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_hash.txt new file mode 100644 index 000000000..f98761b62 --- /dev/null +++ b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_hash.txt @@ -0,0 +1 @@ +46cda4888455c7efb6855ddface180e12d0bed7c5e0ebea4d107944dfac3df9e \ No newline at end of file diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_net.json b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_meta.py b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_tensor_constraints.py b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/model.py b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/model.py new file mode 100644 index 000000000..f2bb993e6 --- /dev/null +++ b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/model.py @@ -0,0 +1,3080 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: 
torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_input_ids_ = L_input_ids_
+        l_token_type_ids_ = L_token_type_ids_
+        l_self_modules_embeddings_buffers_position_ids_ = (
+            L_self_modules_embeddings_buffers_position_ids_
+        )
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_
l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ + 
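+ # Parameters of the final encoder layer (layer 23 of 24) and of the pooler
+ # head follow. Throughout this graph, each uppercase name such as
+ # L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_
+ # is a graph input whose name mangles the original module path
+ # (self.encoder.layer[23].attention.self.key.weight); TorchDynamo re-binds
+ # each one to a lowercase local before first use.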
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_
+ l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_
+ l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_
+ l_self_modules_pooler_modules_dense_parameters_weight_ = (
+     L_self_modules_pooler_modules_dense_parameters_weight_
+ )
+ l_self_modules_pooler_modules_dense_parameters_bias_ = (
+     L_self_modules_pooler_modules_dense_parameters_bias_
+ )
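+ # The traced forward computation starts here. Word, token-type and position
+ # embeddings are looked up separately and summed, then LayerNorm(eps=1e-12)
+ # and dropout are applied. The positional arguments of
+ # torch.nn.functional.embedding are (input, weight, padding_idx, max_norm,
+ # norm_type, scale_grad_by_freq, sparse); only the word-embedding lookup
+ # passes padding_idx=0. dropout(..., 0.0, False, False) is a no-op because
+ # the graph was captured in eval mode with p=0.0.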
+ position_ids = l_self_modules_embeddings_buffers_position_ids_[
+     (slice(None, None, None), slice(0, 11, None))
+ ]
+ l_self_modules_embeddings_buffers_position_ids_ = None
+ inputs_embeds = torch.nn.functional.embedding(
+     l_input_ids_,
+     l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+     0,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ l_input_ids_ = (
+     l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+ ) = None
+ token_type_embeddings = torch.nn.functional.embedding(
+     l_token_type_ids_,
+     l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+     None,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ l_token_type_ids_ = (
+     l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+ ) = None
+ embeddings = inputs_embeds + token_type_embeddings
+ inputs_embeds = token_type_embeddings = None
+ position_embeddings = torch.nn.functional.embedding(
+     position_ids,
+     l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+     None,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ position_ids = (
+     l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+ ) = None
+ embeddings += position_embeddings
+ embeddings_1 = embeddings
+ embeddings = position_embeddings = None
+ embeddings_2 = torch.nn.functional.layer_norm(
+     embeddings_1,
+     (1024,),
+     l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+     l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ embeddings_1 = (
+     l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+ ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+ embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.0, False, False)
+ embeddings_2 = None
+ getitem_1 = l_attention_mask_[
+     (slice(None, None, None), None, None, slice(None, None, None))
+ ]
+ l_attention_mask_ = None
+ expand = getitem_1.expand(1, 1, 11, 11)
+ getitem_1 = None
+ expanded_mask = expand.to(torch.float32)
+ expand = None
+ tensor = torch.tensor(1.0, dtype=torch.float32)
+ inverted_mask = tensor - expanded_mask
+ tensor = expanded_mask = None
+ to_1 = inverted_mask.to(torch.bool)
+ extended_attention_mask = inverted_mask.masked_fill(
+     to_1, -3.4028234663852886e38
+ )
+ inverted_mask = to_1 = None
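+ # The block above builds the additive attention mask: the (1, 11) padding
+ # mask is broadcast to (1, 1, 11, 11), inverted as (1.0 - mask), and the
+ # masked positions are filled with -3.4028234663852886e38 (the float32
+ # minimum), so they vanish under the softmax inside
+ # scaled_dot_product_attention. The 24 encoder layers follow, from layer 0.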
+ linear = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view = linear.view(1, -1, 16, 64)
+ linear = None
+ query_layer = view.transpose(1, 2)
+ view = None
+ linear_1 = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_1 = linear_1.view(1, -1, 16, 64)
+ linear_1 = None
+ key_layer = view_1.transpose(1, 2)
+ view_1 = None
+ linear_2 = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_2 = linear_2.view(1, -1, 16, 64)
+ linear_2 = None
+ value_layer = view_2.transpose(1, 2)
+ view_2 = None
+ attn_output = torch._C._nn.scaled_dot_product_attention(
+     query_layer,
+     key_layer,
+     value_layer,
+     attn_mask=extended_attention_mask,
+     dropout_p=0.0,
+     is_causal=False,
+ )
+ query_layer = key_layer = value_layer = None
+ attn_output_1 = attn_output.transpose(1, 2)
+ attn_output = None
+ attn_output_2 = attn_output_1.reshape(1, 11, 1024)
+ attn_output_1 = None
+ hidden_states = torch._C._nn.linear(
+     attn_output_2,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.0, False, False)
+ hidden_states = None
+ add_1 = hidden_states_1 + embeddings_3
+ hidden_states_1 = embeddings_3 = None
+ hidden_states_2 = torch.nn.functional.layer_norm(
+     add_1,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_3 = torch._C._nn.linear(
+     hidden_states_2,
+     l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+ hidden_states_3 = None
+ hidden_states_5 = torch._C._nn.linear(
+     hidden_states_4,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_6 = torch.nn.functional.dropout(
+     hidden_states_5, 0.0, False, False
+ )
+ hidden_states_5 = None
+ add_2 = hidden_states_6 + hidden_states_2
+ hidden_states_6 = hidden_states_2 = None
+ hidden_states_7 = torch.nn.functional.layer_norm(
+     add_2,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
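+ # End of encoder layer 0. Every subsequent layer repeats this traced block:
+ # Q/K/V projections reshaped to (batch, 16 heads, seq, 64) for the 1024-dim
+ # hidden state, fused attention through
+ # torch._C._nn.scaled_dot_product_attention (the private binding behind
+ # torch.nn.functional.scaled_dot_product_attention), then dense + dropout +
+ # residual + LayerNorm, a GELU feed-forward block, and a second residual +
+ # LayerNorm. A rough eager-mode sketch of one layer (illustrative only;
+ # shapes assumed from the trace):
+ #
+ #     q = F.linear(x, w_q, b_q).view(1, -1, 16, 64).transpose(1, 2)
+ #     k = F.linear(x, w_k, b_k).view(1, -1, 16, 64).transpose(1, 2)
+ #     v = F.linear(x, w_v, b_v).view(1, -1, 16, 64).transpose(1, 2)
+ #     a = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+ #     a = a.transpose(1, 2).reshape(1, 11, 1024)
+ #     x = F.layer_norm(x + F.linear(a, w_o, b_o), (1024,), w_ln1, b_ln1, 1e-12)
+ #     h = F.gelu(F.linear(x, w_i, b_i))
+ #     x = F.layer_norm(x + F.linear(h, w_f, b_f), (1024,), w_ln2, b_ln2, 1e-12)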
+ add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ linear_6 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_3 = linear_6.view(1, -1, 16, 64)
+ linear_6 = None
+ query_layer_1 = view_3.transpose(1, 2)
+ view_3 = None
+ linear_7 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_4 = linear_7.view(1, -1, 16, 64)
+ linear_7 = None
+ key_layer_1 = view_4.transpose(1, 2)
+ view_4 = None
+ linear_8 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_5 = linear_8.view(1, -1, 16, 64)
+ linear_8 = None
+ value_layer_1 = view_5.transpose(1, 2)
+ view_5 = None
+ attn_output_3 = torch._C._nn.scaled_dot_product_attention(
+     query_layer_1,
+     key_layer_1,
+     value_layer_1,
+     attn_mask=extended_attention_mask,
+     dropout_p=0.0,
+     is_causal=False,
+ )
+ query_layer_1 = key_layer_1 = value_layer_1 = None
+ attn_output_4 = attn_output_3.transpose(1, 2)
+ attn_output_3 = None
+ attn_output_5 = attn_output_4.reshape(1, 11, 1024)
+ attn_output_4 = None
+ hidden_states_8 = torch._C._nn.linear(
+     attn_output_5,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_9 = torch.nn.functional.dropout(
+     hidden_states_8, 0.0, False, False
+ )
+ hidden_states_8 = None
+ add_3 = hidden_states_9 + hidden_states_7
+ hidden_states_9 = hidden_states_7 = None
+ hidden_states_10 = torch.nn.functional.layer_norm(
+     add_3,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_11 = torch._C._nn.linear(
+     hidden_states_10,
+     l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+ hidden_states_11 = None
+ hidden_states_13 = torch._C._nn.linear(
+     hidden_states_12,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_14 = torch.nn.functional.dropout(
+     hidden_states_13, 0.0, False, False
+ )
+ hidden_states_13 = None
+ add_4 = hidden_states_14 + hidden_states_10
+ hidden_states_14 = hidden_states_10 = None
+ hidden_states_15 = torch.nn.functional.layer_norm(
+     add_4,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ linear_12 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_6 = linear_12.view(1, -1, 16, 64)
+ linear_12 = None
+ query_layer_2 = view_6.transpose(1, 2)
+ view_6 = None
+ linear_13 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_7 = linear_13.view(1, -1, 16, 64)
+ linear_13 = None
+ key_layer_2 = view_7.transpose(1, 2)
+ view_7 = None
+ linear_14 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_8 = linear_14.view(1, -1, 16, 64)
+ linear_14 = None
+ value_layer_2 = view_8.transpose(1, 2)
+ view_8 = None
+ attn_output_6 = torch._C._nn.scaled_dot_product_attention(
+     query_layer_2,
+     key_layer_2,
+     value_layer_2,
+     attn_mask=extended_attention_mask,
+     dropout_p=0.0,
+     is_causal=False,
+ )
+ query_layer_2 = key_layer_2 = value_layer_2 = None
+ attn_output_7 = attn_output_6.transpose(1, 2)
+ attn_output_6 = None
+ attn_output_8 = attn_output_7.reshape(1, 11, 1024)
+ attn_output_7 = None
+ hidden_states_16 = torch._C._nn.linear(
+     attn_output_8,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_17 = torch.nn.functional.dropout(
+     hidden_states_16, 0.0, False, False
+ )
+ hidden_states_16 = None
+ add_5 = hidden_states_17 + hidden_states_15
+ hidden_states_17 = hidden_states_15 = None
+ hidden_states_18 = torch.nn.functional.layer_norm(
+     add_5,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_19 = torch._C._nn.linear(
+     hidden_states_18,
+     l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+ hidden_states_19 = None
+ hidden_states_21 = torch._C._nn.linear(
+     hidden_states_20,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_22 = torch.nn.functional.dropout(
+     hidden_states_21, 0.0, False, False
+ )
+ hidden_states_21 = None
+ add_6 = hidden_states_22 + hidden_states_18
+ hidden_states_22 = hidden_states_18 = None
+ hidden_states_23 = torch.nn.functional.layer_norm(
+     add_6,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ linear_18 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_9 = linear_18.view(1, -1, 16, 64)
+ linear_18 = None
+ query_layer_3 = view_9.transpose(1, 2)
+ view_9 = None
+ linear_19 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_10 = linear_19.view(1, -1, 16, 64)
+ linear_19 = None
+ key_layer_3 = view_10.transpose(1, 2)
+ view_10 = None
+ linear_20 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_11 = linear_20.view(1, -1, 16, 64)
+ linear_20 = None
+ value_layer_3 = view_11.transpose(1, 2)
+ view_11 = None
+ attn_output_9 = torch._C._nn.scaled_dot_product_attention(
+     query_layer_3,
+     key_layer_3,
+     value_layer_3,
+     attn_mask=extended_attention_mask,
+     dropout_p=0.0,
+     is_causal=False,
+ )
+ query_layer_3 = key_layer_3 = value_layer_3 = None
+ attn_output_10 = attn_output_9.transpose(1, 2)
+ attn_output_9 = None
+ attn_output_11 = attn_output_10.reshape(1, 11, 1024)
+ attn_output_10 = None
+ hidden_states_24 = torch._C._nn.linear(
+     attn_output_11,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_25 = torch.nn.functional.dropout(
+     hidden_states_24, 0.0, False, False
+ )
+ hidden_states_24 = None
+ add_7 = hidden_states_25 + hidden_states_23
+ hidden_states_25 = hidden_states_23 = None
+ hidden_states_26 = torch.nn.functional.layer_norm(
+     add_7,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+        linear_24 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_12 = linear_24.view(1, -1, 16, 64)
+        linear_24 = None
+        query_layer_4 = view_12.transpose(1, 2)
+        view_12 = None
+        linear_25 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_13 = linear_25.view(1, -1, 16, 64)
+        linear_25 = None
+        key_layer_4 = view_13.transpose(1, 2)
+        view_13 = None
+        linear_26 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_14 = linear_26.view(1, -1, 16, 64)
+        linear_26 = None
+        value_layer_4 = view_14.transpose(1, 2)
+        view_14 = None
+        attn_output_12 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_4,
+            key_layer_4,
+            value_layer_4,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_4 = key_layer_4 = value_layer_4 = None
+        attn_output_13 = attn_output_12.transpose(1, 2)
+        attn_output_12 = None
+        attn_output_14 = attn_output_13.reshape(1, 11, 1024)
+        attn_output_13 = None
+        hidden_states_32 = torch._C._nn.linear(
+            attn_output_14,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_33 = torch.nn.functional.dropout(
+            hidden_states_32, 0.0, False, False
+        )
+        hidden_states_32 = None
+        add_9 = hidden_states_33 + hidden_states_31
+        hidden_states_33 = hidden_states_31 = None
+        hidden_states_34 = torch.nn.functional.layer_norm(
+            add_9,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_9 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_35 = torch._C._nn.linear(
+            hidden_states_34,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+        hidden_states_35 = None
+        hidden_states_37 = torch._C._nn.linear(
+            hidden_states_36,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_38 = torch.nn.functional.dropout(
+            hidden_states_37, 0.0, False, False
+        )
+        hidden_states_37 = None
+        add_10 = hidden_states_38 + hidden_states_34
+        hidden_states_38 = hidden_states_34 = None
+        hidden_states_39 = torch.nn.functional.layer_norm(
+            add_10,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_10 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
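+        # --- encoder layer 5 ---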
+        linear_30 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_15 = linear_30.view(1, -1, 16, 64)
+        linear_30 = None
+        query_layer_5 = view_15.transpose(1, 2)
+        view_15 = None
+        linear_31 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_16 = linear_31.view(1, -1, 16, 64)
+        linear_31 = None
+        key_layer_5 = view_16.transpose(1, 2)
+        view_16 = None
+        linear_32 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_17 = linear_32.view(1, -1, 16, 64)
+        linear_32 = None
+        value_layer_5 = view_17.transpose(1, 2)
+        view_17 = None
+        attn_output_15 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_5,
+            key_layer_5,
+            value_layer_5,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_5 = key_layer_5 = value_layer_5 = None
+        attn_output_16 = attn_output_15.transpose(1, 2)
+        attn_output_15 = None
+        attn_output_17 = attn_output_16.reshape(1, 11, 1024)
+        attn_output_16 = None
+        hidden_states_40 = torch._C._nn.linear(
+            attn_output_17,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_41 = torch.nn.functional.dropout(
+            hidden_states_40, 0.0, False, False
+        )
+        hidden_states_40 = None
+        add_11 = hidden_states_41 + hidden_states_39
+        hidden_states_41 = hidden_states_39 = None
+        hidden_states_42 = torch.nn.functional.layer_norm(
+            add_11,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_11 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_43 = torch._C._nn.linear(
+            hidden_states_42,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+        hidden_states_43 = None
+        hidden_states_45 = torch._C._nn.linear(
+            hidden_states_44,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_46 = torch.nn.functional.dropout(
+            hidden_states_45, 0.0, False, False
+        )
+        hidden_states_45 = None
+        add_12 = hidden_states_46 + hidden_states_42
+        hidden_states_46 = hidden_states_42 = None
+        hidden_states_47 = torch.nn.functional.layer_norm(
+            add_12,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_12 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
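+        # --- encoder layer 6 ---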
+        linear_36 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_18 = linear_36.view(1, -1, 16, 64)
+        linear_36 = None
+        query_layer_6 = view_18.transpose(1, 2)
+        view_18 = None
+        linear_37 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_19 = linear_37.view(1, -1, 16, 64)
+        linear_37 = None
+        key_layer_6 = view_19.transpose(1, 2)
+        view_19 = None
+        linear_38 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_20 = linear_38.view(1, -1, 16, 64)
+        linear_38 = None
+        value_layer_6 = view_20.transpose(1, 2)
+        view_20 = None
+        attn_output_18 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_6,
+            key_layer_6,
+            value_layer_6,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_6 = key_layer_6 = value_layer_6 = None
+        attn_output_19 = attn_output_18.transpose(1, 2)
+        attn_output_18 = None
+        attn_output_20 = attn_output_19.reshape(1, 11, 1024)
+        attn_output_19 = None
+        hidden_states_48 = torch._C._nn.linear(
+            attn_output_20,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_49 = torch.nn.functional.dropout(
+            hidden_states_48, 0.0, False, False
+        )
+        hidden_states_48 = None
+        add_13 = hidden_states_49 + hidden_states_47
+        hidden_states_49 = hidden_states_47 = None
+        hidden_states_50 = torch.nn.functional.layer_norm(
+            add_13,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_13 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_51 = torch._C._nn.linear(
+            hidden_states_50,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+        hidden_states_51 = None
+        hidden_states_53 = torch._C._nn.linear(
+            hidden_states_52,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_54 = torch.nn.functional.dropout(
+            hidden_states_53, 0.0, False, False
+        )
+        hidden_states_53 = None
+        add_14 = hidden_states_54 + hidden_states_50
+        hidden_states_54 = hidden_states_50 = None
+        hidden_states_55 = torch.nn.functional.layer_norm(
+            add_14,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_14 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = None
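+        # --- encoder layer 7 ---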
+        linear_42 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_21 = linear_42.view(1, -1, 16, 64)
+        linear_42 = None
+        query_layer_7 = view_21.transpose(1, 2)
+        view_21 = None
+        linear_43 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_22 = linear_43.view(1, -1, 16, 64)
+        linear_43 = None
+        key_layer_7 = view_22.transpose(1, 2)
+        view_22 = None
+        linear_44 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_23 = linear_44.view(1, -1, 16, 64)
+        linear_44 = None
+        value_layer_7 = view_23.transpose(1, 2)
+        view_23 = None
+        attn_output_21 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_7,
+            key_layer_7,
+            value_layer_7,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_7 = key_layer_7 = value_layer_7 = None
+        attn_output_22 = attn_output_21.transpose(1, 2)
+        attn_output_21 = None
+        attn_output_23 = attn_output_22.reshape(1, 11, 1024)
+        attn_output_22 = None
+        hidden_states_56 = torch._C._nn.linear(
+            attn_output_23,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_57 = torch.nn.functional.dropout(
+            hidden_states_56, 0.0, False, False
+        )
+        hidden_states_56 = None
+        add_15 = hidden_states_57 + hidden_states_55
+        hidden_states_57 = hidden_states_55 = None
+        hidden_states_58 = torch.nn.functional.layer_norm(
+            add_15,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_15 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_59 = torch._C._nn.linear(
+            hidden_states_58,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+        hidden_states_59 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_62 = torch.nn.functional.dropout(
+            hidden_states_61, 0.0, False, False
+        )
+        hidden_states_61 = None
+        add_16 = hidden_states_62 + hidden_states_58
+        hidden_states_62 = hidden_states_58 = None
+        hidden_states_63 = torch.nn.functional.layer_norm(
+            add_16,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_16 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = None
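+        # --- encoder layer 8 ---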
+        linear_48 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_24 = linear_48.view(1, -1, 16, 64)
+        linear_48 = None
+        query_layer_8 = view_24.transpose(1, 2)
+        view_24 = None
+        linear_49 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_25 = linear_49.view(1, -1, 16, 64)
+        linear_49 = None
+        key_layer_8 = view_25.transpose(1, 2)
+        view_25 = None
+        linear_50 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_26 = linear_50.view(1, -1, 16, 64)
+        linear_50 = None
+        value_layer_8 = view_26.transpose(1, 2)
+        view_26 = None
+        attn_output_24 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_8,
+            key_layer_8,
+            value_layer_8,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_8 = key_layer_8 = value_layer_8 = None
+        attn_output_25 = attn_output_24.transpose(1, 2)
+        attn_output_24 = None
+        attn_output_26 = attn_output_25.reshape(1, 11, 1024)
+        attn_output_25 = None
+        hidden_states_64 = torch._C._nn.linear(
+            attn_output_26,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_65 = torch.nn.functional.dropout(
+            hidden_states_64, 0.0, False, False
+        )
+        hidden_states_64 = None
+        add_17 = hidden_states_65 + hidden_states_63
+        hidden_states_65 = hidden_states_63 = None
+        hidden_states_66 = torch.nn.functional.layer_norm(
+            add_17,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_17 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_67 = torch._C._nn.linear(
+            hidden_states_66,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_68 = torch._C._nn.gelu(hidden_states_67)
+        hidden_states_67 = None
+        hidden_states_69 = torch._C._nn.linear(
+            hidden_states_68,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_70 = torch.nn.functional.dropout(
+            hidden_states_69, 0.0, False, False
+        )
+        hidden_states_69 = None
+        add_18 = hidden_states_70 + hidden_states_66
+        hidden_states_70 = hidden_states_66 = None
+        hidden_states_71 = torch.nn.functional.layer_norm(
+            add_18,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_18 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = None
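+        # --- encoder layer 9 ---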
+        linear_54 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_27 = linear_54.view(1, -1, 16, 64)
+        linear_54 = None
+        query_layer_9 = view_27.transpose(1, 2)
+        view_27 = None
+        linear_55 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_28 = linear_55.view(1, -1, 16, 64)
+        linear_55 = None
+        key_layer_9 = view_28.transpose(1, 2)
+        view_28 = None
+        linear_56 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_29 = linear_56.view(1, -1, 16, 64)
+        linear_56 = None
+        value_layer_9 = view_29.transpose(1, 2)
+        view_29 = None
+        attn_output_27 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_9,
+            key_layer_9,
+            value_layer_9,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_9 = key_layer_9 = value_layer_9 = None
+        attn_output_28 = attn_output_27.transpose(1, 2)
+        attn_output_27 = None
+        attn_output_29 = attn_output_28.reshape(1, 11, 1024)
+        attn_output_28 = None
+        hidden_states_72 = torch._C._nn.linear(
+            attn_output_29,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_73 = torch.nn.functional.dropout(
+            hidden_states_72, 0.0, False, False
+        )
+        hidden_states_72 = None
+        add_19 = hidden_states_73 + hidden_states_71
+        hidden_states_73 = hidden_states_71 = None
+        hidden_states_74 = torch.nn.functional.layer_norm(
+            add_19,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_19 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_75 = torch._C._nn.linear(
+            hidden_states_74,
+            l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_76 = torch._C._nn.gelu(hidden_states_75)
+        hidden_states_75 = None
+        hidden_states_77 = torch._C._nn.linear(
+            hidden_states_76,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_78 = torch.nn.functional.dropout(
+            hidden_states_77, 0.0, False, False
+        )
+        hidden_states_77 = None
+        add_20 = hidden_states_78 + hidden_states_74
+        hidden_states_78 = hidden_states_74 = None
+        hidden_states_79 = torch.nn.functional.layer_norm(
+            add_20,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_20 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = None
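+        # --- encoder layer 10 ---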
+        linear_60 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_30 = linear_60.view(1, -1, 16, 64)
+        linear_60 = None
+        query_layer_10 = view_30.transpose(1, 2)
+        view_30 = None
+        linear_61 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_31 = linear_61.view(1, -1, 16, 64)
+        linear_61 = None
+        key_layer_10 = view_31.transpose(1, 2)
+        view_31 = None
+        linear_62 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_32 = linear_62.view(1, -1, 16, 64)
+        linear_62 = None
+        value_layer_10 = view_32.transpose(1, 2)
+        view_32 = None
+        attn_output_30 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_10,
+            key_layer_10,
+            value_layer_10,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_10 = key_layer_10 = value_layer_10 = None
+        attn_output_31 = attn_output_30.transpose(1, 2)
+        attn_output_30 = None
+        attn_output_32 = attn_output_31.reshape(1, 11, 1024)
+        attn_output_31 = None
+        hidden_states_80 = torch._C._nn.linear(
+            attn_output_32,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_81 = torch.nn.functional.dropout(
+            hidden_states_80, 0.0, False, False
+        )
+        hidden_states_80 = None
+        add_21 = hidden_states_81 + hidden_states_79
+        hidden_states_81 = hidden_states_79 = None
+        hidden_states_82 = torch.nn.functional.layer_norm(
+            add_21,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_21 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_83 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_84 = torch._C._nn.gelu(hidden_states_83)
+        hidden_states_83 = None
+        hidden_states_85 = torch._C._nn.linear(
+            hidden_states_84,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_86 = torch.nn.functional.dropout(
+            hidden_states_85, 0.0, False, False
+        )
+        hidden_states_85 = None
+        add_22 = hidden_states_86 + hidden_states_82
+        hidden_states_86 = hidden_states_82 = None
+        hidden_states_87 = torch.nn.functional.layer_norm(
+            add_22,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_22 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = None
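+        # --- encoder layer 11 ---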
+        linear_66 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_33 = linear_66.view(1, -1, 16, 64)
+        linear_66 = None
+        query_layer_11 = view_33.transpose(1, 2)
+        view_33 = None
+        linear_67 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_34 = linear_67.view(1, -1, 16, 64)
+        linear_67 = None
+        key_layer_11 = view_34.transpose(1, 2)
+        view_34 = None
+        linear_68 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_35 = linear_68.view(1, -1, 16, 64)
+        linear_68 = None
+        value_layer_11 = view_35.transpose(1, 2)
+        view_35 = None
+        attn_output_33 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_11,
+            key_layer_11,
+            value_layer_11,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_11 = key_layer_11 = value_layer_11 = None
+        attn_output_34 = attn_output_33.transpose(1, 2)
+        attn_output_33 = None
+        attn_output_35 = attn_output_34.reshape(1, 11, 1024)
+        attn_output_34 = None
+        hidden_states_88 = torch._C._nn.linear(
+            attn_output_35,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_89 = torch.nn.functional.dropout(
+            hidden_states_88, 0.0, False, False
+        )
+        hidden_states_88 = None
+        add_23 = hidden_states_89 + hidden_states_87
+        hidden_states_89 = hidden_states_87 = None
+        hidden_states_90 = torch.nn.functional.layer_norm(
+            add_23,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_23 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_91 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_92 = torch._C._nn.gelu(hidden_states_91)
+        hidden_states_91 = None
+        hidden_states_93 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_94 = torch.nn.functional.dropout(
+            hidden_states_93, 0.0, False, False
+        )
+        hidden_states_93 = None
+        add_24 = hidden_states_94 + hidden_states_90
+        hidden_states_94 = hidden_states_90 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            add_24,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_24 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = None
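+        # --- encoder layer 12 ---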
+        linear_72 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_36 = linear_72.view(1, -1, 16, 64)
+        linear_72 = None
+        query_layer_12 = view_36.transpose(1, 2)
+        view_36 = None
+        linear_73 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_37 = linear_73.view(1, -1, 16, 64)
+        linear_73 = None
+        key_layer_12 = view_37.transpose(1, 2)
+        view_37 = None
+        linear_74 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_38 = linear_74.view(1, -1, 16, 64)
+        linear_74 = None
+        value_layer_12 = view_38.transpose(1, 2)
+        view_38 = None
+        attn_output_36 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_12,
+            key_layer_12,
+            value_layer_12,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_12 = key_layer_12 = value_layer_12 = None
+        attn_output_37 = attn_output_36.transpose(1, 2)
+        attn_output_36 = None
+        attn_output_38 = attn_output_37.reshape(1, 11, 1024)
+        attn_output_37 = None
+        hidden_states_96 = torch._C._nn.linear(
+            attn_output_38,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_97 = torch.nn.functional.dropout(
+            hidden_states_96, 0.0, False, False
+        )
+        hidden_states_96 = None
+        add_25 = hidden_states_97 + hidden_states_95
+        hidden_states_97 = hidden_states_95 = None
+        hidden_states_98 = torch.nn.functional.layer_norm(
+            add_25,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_25 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_99 = torch._C._nn.linear(
+            hidden_states_98,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_100 = torch._C._nn.gelu(hidden_states_99)
+        hidden_states_99 = None
+        hidden_states_101 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_100 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_102 = torch.nn.functional.dropout(
+            hidden_states_101, 0.0, False, False
+        )
+        hidden_states_101 = None
+        add_26 = hidden_states_102 + hidden_states_98
+        hidden_states_102 = hidden_states_98 = None
+        hidden_states_103 = torch.nn.functional.layer_norm(
+            add_26,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_26 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = None
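+        # --- encoder layer 13 ---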
+        linear_78 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_39 = linear_78.view(1, -1, 16, 64)
+        linear_78 = None
+        query_layer_13 = view_39.transpose(1, 2)
+        view_39 = None
+        linear_79 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_40 = linear_79.view(1, -1, 16, 64)
+        linear_79 = None
+        key_layer_13 = view_40.transpose(1, 2)
+        view_40 = None
+        linear_80 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_41 = linear_80.view(1, -1, 16, 64)
+        linear_80 = None
+        value_layer_13 = view_41.transpose(1, 2)
+        view_41 = None
+        attn_output_39 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_13,
+            key_layer_13,
+            value_layer_13,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_13 = key_layer_13 = value_layer_13 = None
+        attn_output_40 = attn_output_39.transpose(1, 2)
+        attn_output_39 = None
+        attn_output_41 = attn_output_40.reshape(1, 11, 1024)
+        attn_output_40 = None
+        hidden_states_104 = torch._C._nn.linear(
+            attn_output_41,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_105 = torch.nn.functional.dropout(
+            hidden_states_104, 0.0, False, False
+        )
+        hidden_states_104 = None
+        add_27 = hidden_states_105 + hidden_states_103
+        hidden_states_105 = hidden_states_103 = None
+        hidden_states_106 = torch.nn.functional.layer_norm(
+            add_27,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_27 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_107 = torch._C._nn.linear(
+            hidden_states_106,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_108 = torch._C._nn.gelu(hidden_states_107)
+        hidden_states_107 = None
+        hidden_states_109 = torch._C._nn.linear(
+            hidden_states_108,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_108 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_110 = torch.nn.functional.dropout(
+            hidden_states_109, 0.0, False, False
+        )
+        hidden_states_109 = None
+        add_28 = hidden_states_110 + hidden_states_106
+        hidden_states_110 = hidden_states_106 = None
+        hidden_states_111 = torch.nn.functional.layer_norm(
+            add_28,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_28 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = None
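+        # --- encoder layer 14 ---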
+        linear_84 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_42 = linear_84.view(1, -1, 16, 64)
+        linear_84 = None
+        query_layer_14 = view_42.transpose(1, 2)
+        view_42 = None
+        linear_85 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_43 = linear_85.view(1, -1, 16, 64)
+        linear_85 = None
+        key_layer_14 = view_43.transpose(1, 2)
+        view_43 = None
+        linear_86 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_44 = linear_86.view(1, -1, 16, 64)
+        linear_86 = None
+        value_layer_14 = view_44.transpose(1, 2)
+        view_44 = None
+        attn_output_42 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_14,
+            key_layer_14,
+            value_layer_14,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_14 = key_layer_14 = value_layer_14 = None
+        attn_output_43 = attn_output_42.transpose(1, 2)
+        attn_output_42 = None
+        attn_output_44 = attn_output_43.reshape(1, 11, 1024)
+        attn_output_43 = None
+        hidden_states_112 = torch._C._nn.linear(
+            attn_output_44,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_113 = torch.nn.functional.dropout(
+            hidden_states_112, 0.0, False, False
+        )
+        hidden_states_112 = None
+        add_29 = hidden_states_113 + hidden_states_111
+        hidden_states_113 = hidden_states_111 = None
+        hidden_states_114 = torch.nn.functional.layer_norm(
+            add_29,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_29 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_115 = torch._C._nn.linear(
+            hidden_states_114,
+            l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_116 = torch._C._nn.gelu(hidden_states_115)
+        hidden_states_115 = None
+        hidden_states_117 = torch._C._nn.linear(
+            hidden_states_116,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_116 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_118 = torch.nn.functional.dropout(
+            hidden_states_117, 0.0, False, False
+        )
+        hidden_states_117 = None
+        add_30 = hidden_states_118 + hidden_states_114
+        hidden_states_118 = hidden_states_114 = None
+        hidden_states_119 = torch.nn.functional.layer_norm(
+            add_30,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_30 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = None
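+        # --- encoder layer 15 ---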
+        hidden_states_114 = torch.nn.functional.layer_norm(add_29, (1024,), l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_29 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_115 = torch._C._nn.linear(hidden_states_114, l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_116 = torch._C._nn.gelu(hidden_states_115)
+        hidden_states_115 = None
+        hidden_states_117 = torch._C._nn.linear(hidden_states_116, l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_)
+        hidden_states_116 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_118 = torch.nn.functional.dropout(hidden_states_117, 0.0, False, False)
+        hidden_states_117 = None
+        add_30 = hidden_states_118 + hidden_states_114
+        hidden_states_118 = hidden_states_114 = None
+        hidden_states_119 = torch.nn.functional.layer_norm(add_30, (1024,), l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_30 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = None
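+        # With dropout_p=0.0 and training=False, every dropout call in this
+        # trace is a no-op; it is kept because the capture records the module's
+        # calls verbatim.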
+        linear_90 = torch._C._nn.linear(hidden_states_119, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_45 = linear_90.view(1, -1, 16, 64)
+        linear_90 = None
+        query_layer_15 = view_45.transpose(1, 2)
+        view_45 = None
+        linear_91 = torch._C._nn.linear(hidden_states_119, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_46 = linear_91.view(1, -1, 16, 64)
+        linear_91 = None
+        key_layer_15 = view_46.transpose(1, 2)
+        view_46 = None
+        linear_92 = torch._C._nn.linear(hidden_states_119, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_47 = linear_92.view(1, -1, 16, 64)
+        linear_92 = None
+        value_layer_15 = view_47.transpose(1, 2)
+        view_47 = None
+        attn_output_45 = torch._C._nn.scaled_dot_product_attention(query_layer_15, key_layer_15, value_layer_15, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_15 = key_layer_15 = value_layer_15 = None
+        attn_output_46 = attn_output_45.transpose(1, 2)
+        attn_output_45 = None
+        attn_output_47 = attn_output_46.reshape(1, 11, 1024)
+        attn_output_46 = None
+        hidden_states_120 = torch._C._nn.linear(attn_output_47, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_121 = torch.nn.functional.dropout(hidden_states_120, 0.0, False, False)
+        hidden_states_120 = None
+        add_31 = hidden_states_121 + hidden_states_119
+        hidden_states_121 = hidden_states_119 = None
+        hidden_states_122 = torch.nn.functional.layer_norm(add_31, (1024,), l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_31 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_123 = torch._C._nn.linear(hidden_states_122, l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_124 = torch._C._nn.gelu(hidden_states_123)
+        hidden_states_123 = None
+        hidden_states_125 = torch._C._nn.linear(hidden_states_124, l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_)
+        hidden_states_124 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_126 = torch.nn.functional.dropout(hidden_states_125, 0.0, False, False)
+        hidden_states_125 = None
+        add_32 = hidden_states_126 + hidden_states_122
+        hidden_states_126 = hidden_states_122 = None
+        hidden_states_127 = torch.nn.functional.layer_norm(add_32, (1024,), l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_32 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = None
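+        # extended_attention_mask carries the padding mask into each fused
+        # scaled_dot_product_attention call; it is released only after the
+        # final encoder layer has used it.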
+        linear_96 = torch._C._nn.linear(hidden_states_127, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_48 = linear_96.view(1, -1, 16, 64)
+        linear_96 = None
+        query_layer_16 = view_48.transpose(1, 2)
+        view_48 = None
+        linear_97 = torch._C._nn.linear(hidden_states_127, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_49 = linear_97.view(1, -1, 16, 64)
+        linear_97 = None
+        key_layer_16 = view_49.transpose(1, 2)
+        view_49 = None
+        linear_98 = torch._C._nn.linear(hidden_states_127, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_50 = linear_98.view(1, -1, 16, 64)
+        linear_98 = None
+        value_layer_16 = view_50.transpose(1, 2)
+        view_50 = None
+        attn_output_48 = torch._C._nn.scaled_dot_product_attention(query_layer_16, key_layer_16, value_layer_16, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_16 = key_layer_16 = value_layer_16 = None
+        attn_output_49 = attn_output_48.transpose(1, 2)
+        attn_output_48 = None
+        attn_output_50 = attn_output_49.reshape(1, 11, 1024)
+        attn_output_49 = None
+        hidden_states_128 = torch._C._nn.linear(attn_output_50, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_129 = torch.nn.functional.dropout(hidden_states_128, 0.0, False, False)
+        hidden_states_128 = None
+        add_33 = hidden_states_129 + hidden_states_127
+        hidden_states_129 = hidden_states_127 = None
+        hidden_states_130 = torch.nn.functional.layer_norm(add_33, (1024,), l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_33 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_131 = torch._C._nn.linear(hidden_states_130, l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_132 = torch._C._nn.gelu(hidden_states_131)
+        hidden_states_131 = None
+        hidden_states_133 = torch._C._nn.linear(hidden_states_132, l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_)
+        hidden_states_132 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_134 = torch.nn.functional.dropout(hidden_states_133, 0.0, False, False)
+        hidden_states_133 = None
+        add_34 = hidden_states_134 + hidden_states_130
+        hidden_states_134 = hidden_states_130 = None
+        hidden_states_135 = torch.nn.functional.layer_norm(add_34, (1024,), l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_34 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = None
+        linear_102 = torch._C._nn.linear(hidden_states_135, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_51 = linear_102.view(1, -1, 16, 64)
+        linear_102 = None
+        query_layer_17 = view_51.transpose(1, 2)
+        view_51 = None
+        linear_103 = torch._C._nn.linear(hidden_states_135, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_52 = linear_103.view(1, -1, 16, 64)
+        linear_103 = None
+        key_layer_17 = view_52.transpose(1, 2)
+        view_52 = None
+        linear_104 = torch._C._nn.linear(hidden_states_135, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_53 = linear_104.view(1, -1, 16, 64)
+        linear_104 = None
+        value_layer_17 = view_53.transpose(1, 2)
+        view_53 = None
+        attn_output_51 = torch._C._nn.scaled_dot_product_attention(query_layer_17, key_layer_17, value_layer_17, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_17 = key_layer_17 = value_layer_17 = None
+        attn_output_52 = attn_output_51.transpose(1, 2)
+        attn_output_51 = None
+        attn_output_53 = attn_output_52.reshape(1, 11, 1024)
+        attn_output_52 = None
+        hidden_states_136 = torch._C._nn.linear(attn_output_53, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_137 = torch.nn.functional.dropout(hidden_states_136, 0.0, False, False)
+        hidden_states_136 = None
+        add_35 = hidden_states_137 + hidden_states_135
+        hidden_states_137 = hidden_states_135 = None
+        hidden_states_138 = torch.nn.functional.layer_norm(add_35, (1024,), l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_35 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_139 = torch._C._nn.linear(hidden_states_138, l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_140 = torch._C._nn.gelu(hidden_states_139)
+        hidden_states_139 = None
+        hidden_states_141 = torch._C._nn.linear(hidden_states_140, l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_)
+        hidden_states_140 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_142 = torch.nn.functional.dropout(hidden_states_141, 0.0, False, False)
+        hidden_states_141 = None
+        add_36 = hidden_states_142 + hidden_states_138
+        hidden_states_142 = hidden_states_138 = None
+        hidden_states_143 = torch.nn.functional.layer_norm(add_36, (1024,), l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_36 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = None
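+        # Rebinding intermediates to None drops the last live reference so each
+        # activation buffer can be freed as soon as it is no longer needed.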
+        linear_108 = torch._C._nn.linear(hidden_states_143, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_54 = linear_108.view(1, -1, 16, 64)
+        linear_108 = None
+        query_layer_18 = view_54.transpose(1, 2)
+        view_54 = None
+        linear_109 = torch._C._nn.linear(hidden_states_143, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_55 = linear_109.view(1, -1, 16, 64)
+        linear_109 = None
+        key_layer_18 = view_55.transpose(1, 2)
+        view_55 = None
+        linear_110 = torch._C._nn.linear(hidden_states_143, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_56 = linear_110.view(1, -1, 16, 64)
+        linear_110 = None
+        value_layer_18 = view_56.transpose(1, 2)
+        view_56 = None
+        attn_output_54 = torch._C._nn.scaled_dot_product_attention(query_layer_18, key_layer_18, value_layer_18, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_18 = key_layer_18 = value_layer_18 = None
+        attn_output_55 = attn_output_54.transpose(1, 2)
+        attn_output_54 = None
+        attn_output_56 = attn_output_55.reshape(1, 11, 1024)
+        attn_output_55 = None
+        hidden_states_144 = torch._C._nn.linear(attn_output_56, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_56 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_145 = torch.nn.functional.dropout(hidden_states_144, 0.0, False, False)
+        hidden_states_144 = None
+        add_37 = hidden_states_145 + hidden_states_143
+        hidden_states_145 = hidden_states_143 = None
+        hidden_states_146 = torch.nn.functional.layer_norm(add_37, (1024,), l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_37 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_147 = torch._C._nn.linear(hidden_states_146, l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_148 = torch._C._nn.gelu(hidden_states_147)
+        hidden_states_147 = None
+        hidden_states_149 = torch._C._nn.linear(hidden_states_148, l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_)
+        hidden_states_148 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_150 = torch.nn.functional.dropout(hidden_states_149, 0.0, False, False)
+        hidden_states_149 = None
+        add_38 = hidden_states_150 + hidden_states_146
+        hidden_states_150 = hidden_states_146 = None
+        hidden_states_151 = torch.nn.functional.layer_norm(add_38, (1024,), l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_38 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = None
+        linear_114 = torch._C._nn.linear(hidden_states_151, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_57 = linear_114.view(1, -1, 16, 64)
+        linear_114 = None
+        query_layer_19 = view_57.transpose(1, 2)
+        view_57 = None
+        linear_115 = torch._C._nn.linear(hidden_states_151, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_58 = linear_115.view(1, -1, 16, 64)
+        linear_115 = None
+        key_layer_19 = view_58.transpose(1, 2)
+        view_58 = None
+        linear_116 = torch._C._nn.linear(hidden_states_151, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_59 = linear_116.view(1, -1, 16, 64)
+        linear_116 = None
+        value_layer_19 = view_59.transpose(1, 2)
+        view_59 = None
+        attn_output_57 = torch._C._nn.scaled_dot_product_attention(query_layer_19, key_layer_19, value_layer_19, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_19 = key_layer_19 = value_layer_19 = None
+        attn_output_58 = attn_output_57.transpose(1, 2)
+        attn_output_57 = None
+        attn_output_59 = attn_output_58.reshape(1, 11, 1024)
+        attn_output_58 = None
+        hidden_states_152 = torch._C._nn.linear(attn_output_59, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_153 = torch.nn.functional.dropout(hidden_states_152, 0.0, False, False)
+        hidden_states_152 = None
+        add_39 = hidden_states_153 + hidden_states_151
+        hidden_states_153 = hidden_states_151 = None
+        hidden_states_154 = torch.nn.functional.layer_norm(add_39, (1024,), l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_39 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_155 = torch._C._nn.linear(hidden_states_154, l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_156 = torch._C._nn.gelu(hidden_states_155)
+        hidden_states_155 = None
+        hidden_states_157 = torch._C._nn.linear(hidden_states_156, l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_)
+        hidden_states_156 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_158 = torch.nn.functional.dropout(hidden_states_157, 0.0, False, False)
+        hidden_states_157 = None
+        add_40 = hidden_states_158 + hidden_states_154
+        hidden_states_158 = hidden_states_154 = None
+        hidden_states_159 = torch.nn.functional.layer_norm(add_40, (1024,), l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_40 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = None
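+        # Shapes such as (1, 11, 1024) are specialized to the captured sample
+        # input (batch size 1, sequence length 11) rather than kept symbolic.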
+        linear_120 = torch._C._nn.linear(hidden_states_159, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_60 = linear_120.view(1, -1, 16, 64)
+        linear_120 = None
+        query_layer_20 = view_60.transpose(1, 2)
+        view_60 = None
+        linear_121 = torch._C._nn.linear(hidden_states_159, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_61 = linear_121.view(1, -1, 16, 64)
+        linear_121 = None
+        key_layer_20 = view_61.transpose(1, 2)
+        view_61 = None
+        linear_122 = torch._C._nn.linear(hidden_states_159, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_62 = linear_122.view(1, -1, 16, 64)
+        linear_122 = None
+        value_layer_20 = view_62.transpose(1, 2)
+        view_62 = None
+        attn_output_60 = torch._C._nn.scaled_dot_product_attention(query_layer_20, key_layer_20, value_layer_20, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_20 = key_layer_20 = value_layer_20 = None
+        attn_output_61 = attn_output_60.transpose(1, 2)
+        attn_output_60 = None
+        attn_output_62 = attn_output_61.reshape(1, 11, 1024)
+        attn_output_61 = None
+        hidden_states_160 = torch._C._nn.linear(attn_output_62, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_161 = torch.nn.functional.dropout(hidden_states_160, 0.0, False, False)
+        hidden_states_160 = None
+        add_41 = hidden_states_161 + hidden_states_159
+        hidden_states_161 = hidden_states_159 = None
+        hidden_states_162 = torch.nn.functional.layer_norm(add_41, (1024,), l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_41 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_163 = torch._C._nn.linear(hidden_states_162, l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_164 = torch._C._nn.gelu(hidden_states_163)
+        hidden_states_163 = None
+        hidden_states_165 = torch._C._nn.linear(hidden_states_164, l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_)
+        hidden_states_164 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_166 = torch.nn.functional.dropout(hidden_states_165, 0.0, False, False)
+        hidden_states_165 = None
+        add_42 = hidden_states_166 + hidden_states_162
+        hidden_states_166 = hidden_states_162 = None
+        hidden_states_167 = torch.nn.functional.layer_norm(add_42, (1024,), l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_42 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = None
+        linear_126 = torch._C._nn.linear(hidden_states_167, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_63 = linear_126.view(1, -1, 16, 64)
+        linear_126 = None
+        query_layer_21 = view_63.transpose(1, 2)
+        view_63 = None
+        linear_127 = torch._C._nn.linear(hidden_states_167, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_64 = linear_127.view(1, -1, 16, 64)
+        linear_127 = None
+        key_layer_21 = view_64.transpose(1, 2)
+        view_64 = None
+        linear_128 = torch._C._nn.linear(hidden_states_167, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_65 = linear_128.view(1, -1, 16, 64)
+        linear_128 = None
+        value_layer_21 = view_65.transpose(1, 2)
+        view_65 = None
+        attn_output_63 = torch._C._nn.scaled_dot_product_attention(query_layer_21, key_layer_21, value_layer_21, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_21 = key_layer_21 = value_layer_21 = None
+        attn_output_64 = attn_output_63.transpose(1, 2)
+        attn_output_63 = None
+        attn_output_65 = attn_output_64.reshape(1, 11, 1024)
+        attn_output_64 = None
+        hidden_states_168 = torch._C._nn.linear(attn_output_65, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_65 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_169 = torch.nn.functional.dropout(hidden_states_168, 0.0, False, False)
+        hidden_states_168 = None
+        add_43 = hidden_states_169 + hidden_states_167
+        hidden_states_169 = hidden_states_167 = None
+        hidden_states_170 = torch.nn.functional.layer_norm(add_43, (1024,), l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_43 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_171 = torch._C._nn.linear(hidden_states_170, l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_172 = torch._C._nn.gelu(hidden_states_171)
+        hidden_states_171 = None
+        hidden_states_173 = torch._C._nn.linear(hidden_states_172, l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_)
+        hidden_states_172 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_174 = torch.nn.functional.dropout(hidden_states_173, 0.0, False, False)
+        hidden_states_173 = None
+        add_44 = hidden_states_174 + hidden_states_170
+        hidden_states_174 = hidden_states_170 = None
+        hidden_states_175 = torch.nn.functional.layer_norm(add_44, (1024,), l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_44 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = None
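+        # Parameter tensors enter the graph as flat function arguments whose
+        # names encode the original module path
+        # (encoder.layer.<i>.attention.self.query.weight, ...).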
+        linear_132 = torch._C._nn.linear(hidden_states_175, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_66 = linear_132.view(1, -1, 16, 64)
+        linear_132 = None
+        query_layer_22 = view_66.transpose(1, 2)
+        view_66 = None
+        linear_133 = torch._C._nn.linear(hidden_states_175, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_67 = linear_133.view(1, -1, 16, 64)
+        linear_133 = None
+        key_layer_22 = view_67.transpose(1, 2)
+        view_67 = None
+        linear_134 = torch._C._nn.linear(hidden_states_175, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_68 = linear_134.view(1, -1, 16, 64)
+        linear_134 = None
+        value_layer_22 = view_68.transpose(1, 2)
+        view_68 = None
+        attn_output_66 = torch._C._nn.scaled_dot_product_attention(query_layer_22, key_layer_22, value_layer_22, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_22 = key_layer_22 = value_layer_22 = None
+        attn_output_67 = attn_output_66.transpose(1, 2)
+        attn_output_66 = None
+        attn_output_68 = attn_output_67.reshape(1, 11, 1024)
+        attn_output_67 = None
+        hidden_states_176 = torch._C._nn.linear(attn_output_68, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_177 = torch.nn.functional.dropout(hidden_states_176, 0.0, False, False)
+        hidden_states_176 = None
+        add_45 = hidden_states_177 + hidden_states_175
+        hidden_states_177 = hidden_states_175 = None
+        hidden_states_178 = torch.nn.functional.layer_norm(add_45, (1024,), l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_45 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_179 = torch._C._nn.linear(hidden_states_178, l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_180 = torch._C._nn.gelu(hidden_states_179)
+        hidden_states_179 = None
+        hidden_states_181 = torch._C._nn.linear(hidden_states_180, l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_)
+        hidden_states_180 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_182 = torch.nn.functional.dropout(hidden_states_181, 0.0, False, False)
+        hidden_states_181 = None
+        add_46 = hidden_states_182 + hidden_states_178
+        hidden_states_182 = hidden_states_178 = None
+        hidden_states_183 = torch.nn.functional.layer_norm(add_46, (1024,), l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_46 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = None
+        linear_138 = torch._C._nn.linear(hidden_states_183, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_69 = linear_138.view(1, -1, 16, 64)
+        linear_138 = None
+        query_layer_23 = view_69.transpose(1, 2)
+        view_69 = None
+        linear_139 = torch._C._nn.linear(hidden_states_183, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_70 = linear_139.view(1, -1, 16, 64)
+        linear_139 = None
+        key_layer_23 = view_70.transpose(1, 2)
+        view_70 = None
+        linear_140 = torch._C._nn.linear(hidden_states_183, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_71 = linear_140.view(1, -1, 16, 64)
+        linear_140 = None
+        value_layer_23 = view_71.transpose(1, 2)
+        view_71 = None
+        attn_output_69 = torch._C._nn.scaled_dot_product_attention(query_layer_23, key_layer_23, value_layer_23, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+        query_layer_23 = key_layer_23 = value_layer_23 = extended_attention_mask = None
+        attn_output_70 = attn_output_69.transpose(1, 2)
+        attn_output_69 = None
+        attn_output_71 = attn_output_70.reshape(1, 11, 1024)
+        attn_output_70 = None
+        hidden_states_184 = torch._C._nn.linear(attn_output_71, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_)
+        attn_output_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_185 = torch.nn.functional.dropout(hidden_states_184, 0.0, False, False)
+        hidden_states_184 = None
+        add_47 = hidden_states_185 + hidden_states_183
+        hidden_states_185 = hidden_states_183 = None
+        hidden_states_186 = torch.nn.functional.layer_norm(add_47, (1024,), l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_47 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_187 = torch._C._nn.linear(hidden_states_186, l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_188 = torch._C._nn.gelu(hidden_states_187)
+        hidden_states_187 = None
+        hidden_states_189 = torch._C._nn.linear(hidden_states_188, l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_)
+        hidden_states_188 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_190 = torch.nn.functional.dropout(hidden_states_189, 0.0, False, False)
+        hidden_states_189 = None
+        add_48 = hidden_states_190 + hidden_states_186
+        hidden_states_190 = hidden_states_186 = None
+        hidden_states_191 = torch.nn.functional.layer_norm(add_48, (1024,), l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_48 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = None
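+        # Pooler head: take the hidden state of token 0 ([CLS]), project it
+        # through a dense layer, and apply tanh to form the pooled output.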
+        first_token_tensor = hidden_states_191[(slice(None, None, None), 0)]
+        pooled_output = torch._C._nn.linear(first_token_tensor, l_self_modules_pooler_modules_dense_parameters_weight_, l_self_modules_pooler_modules_dense_parameters_bias_)
+        first_token_tensor = l_self_modules_pooler_modules_dense_parameters_weight_ = l_self_modules_pooler_modules_dense_parameters_bias_ = None
+        pooled_output_1 = torch.tanh(pooled_output)
+        pooled_output = None
+        return (hidden_states_191, pooled_output_1)
diff --git a/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/weight_meta.py b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/weight_meta.py
new file mode 100644
index 000000000..b7d97ec90
--- /dev/null
+++ b/samples/transformers-auto-model/Onutoa_1_6e-3_5_0.5/weight_meta.py
@@ -0,0 +1,3949 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [101, 10629, 7159, 2003, 2109, 2000, 14817, 15078, 19287, 1012, 102]
+
+
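+# Each Program_weight_tensor_meta_* class records what is needed to rebuild one
+# graph input: traced name, shape, dtype, device, and either literal data (for
+# the small integer inputs) or mean/std summary statistics (for parameters).
+
+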
+class Program_weight_tensor_meta_L_token_type_ids_:
+    name = "L_token_type_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_:
+    name = "L_self_modules_embeddings_buffers_position_ids_"
+    shape = [1, 512]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 0
+    max_val = 511
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [30522, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [2, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.021
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [512, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
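+# Parameter metas keep only summary statistics: std of roughly 0.020 matches
+# BERT's default initializer_range of 0.02, and LayerNorm weights report
+# mean 1.000 with std 0.000.
+
+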
"L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 
-0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + 
shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + 
name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + 
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_:
+    name =
"L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_" + shape = 
[4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 
0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: + name 
= "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype 
= "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 
1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name 
= "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_" + shape = 
[1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt new file mode 100644 index 000000000..b94d1a80d --- /dev/null +++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt @@ -0,0 +1 @@ +aa5cccf97f87d4ac1759b92eab1718addd670d30a8edf663d66f0472bd6a8bb7 \ No newline at end of file diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_meta.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_tensor_constraints.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py new file mode 100644 index 000000000..183926b5c --- /dev/null +++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py @@ -0,0 +1,2058 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: 
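The weight_meta.py records above describe each parameter only by summary statistics (shape, dtype, device, mean, std); the trained values themselves are not shipped (data = None). A minimal sketch of how such a record could be turned into a placeholder tensor when replaying the graph; the materialize helper below is hypothetical and not part of this patch:

import torch

def materialize(meta):
    # Build a placeholder tensor that matches a Program_weight_tensor_meta
    # record. Real values are absent (data = None), so draw from N(mean, std).
    dtype = getattr(torch, meta.dtype.split(".")[-1])  # "torch.float32" -> torch.float32
    t = torch.empty(meta.shape, dtype=dtype, device=meta.device)
    if meta.std == 0.0:
        t.fill_(meta.mean)  # e.g. LayerNorm weights: constant 1.000
    else:
        t.normal_(meta.mean, meta.std)
    return torch.nn.Parameter(t)

For LayerNorm parameters (mean = 1.000 or 0.000, std = 0.000) this reproduces the recorded constants exactly; for dense weights it only matches the recorded distribution, not the trained values.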
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt
new file mode 100644
index 000000000..b94d1a80d
--- /dev/null
+++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_hash.txt
@@ -0,0 +1 @@
+aa5cccf97f87d4ac1759b92eab1718addd670d30a8edf663d66f0472bd6a8bb7
\ No newline at end of file
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_meta.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_tensor_constraints.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py
new file mode 100644
index 000000000..183926b5c
--- /dev/null
+++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/model.py
@@ -0,0 +1,2058 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_input_ids_ = L_input_ids_
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
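+        # The twelve encoder layers (0-11) are unpacked below with the same
+        # 16-parameter pattern per layer: query/key/value weight and bias,
+        # attention output dense + LayerNorm, intermediate dense, and
+        # output dense + LayerNorm.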
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_ + 
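+        # Note: the L_* names appear to be module parameters flattened into graph
+        # inputs by torch.compile/Dynamo capture; each is re-bound to a local l_*
+        # alias before use. Layer 11 repeats the same pattern as layers 0-10.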
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_pooler_modules_dense_parameters_weight_ = (
+            L_self_modules_pooler_modules_dense_parameters_weight_
+        )
+        l_self_modules_pooler_modules_dense_parameters_bias_ = (
+            L_self_modules_pooler_modules_dense_parameters_bias_
+        )
+        token_type_ids = torch.zeros(
+            (1, 11), dtype=torch.int64, device=device(type="cuda", index=0)
+        )
+        extended_attention_mask = l_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_attention_mask_ = None
+        extended_attention_mask_1 = extended_attention_mask.to(dtype=torch.float32)
+        extended_attention_mask = None
+        sub = 1.0 - extended_attention_mask_1
+        extended_attention_mask_1 = None
+        extended_attention_mask_2 = sub * -3.4028234663852886e38
+        sub = None
+        ne = l_input_ids_.ne(1)
+        mask = ne.int()
+        ne = None
+        cumsum = torch.cumsum(mask, dim=1)
+        type_as = cumsum.type_as(mask)
+        cumsum = None
+        add = type_as + 0
+        type_as = None
+        incremental_indices = add * mask
+        add = mask = None
+        long = incremental_indices.long()
+        incremental_indices = None
+        add_1 = long + 1
+        long = None
+        position_ids = add_1.to(device(type="cuda", index=0))
+        add_1 = None
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        token_type_embeddings = torch.nn.functional.embedding(
+            token_type_ids,
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        token_type_ids = (
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        ) = None
+        x_act = token_type_embeddings + inputs_embeds
+        token_type_embeddings = inputs_embeds = None
+        position_embeddings = torch.nn.functional.embedding(
+            position_ids,
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        position_ids = (
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        ) = None
+        x_act_1 = position_embeddings + x_act
+        position_embeddings = x_act = None
+        mean = x_act_1.mean(axis=2, keepdim=True)
+        y = x_act_1 - mean
+        x_act_1 = mean = None
+        pow_1 = y**2
+        var = torch.mean(pow_1, axis=2, keepdim=True)
+        pow_1 = None
+        add_4 = 1e-05 + var
+        var = None
+        sqrt = torch.sqrt(add_4)
+        add_4 = None
+        x = y / sqrt
+        y = sqrt = None
+        mul_2 = x * l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        x = l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = None
+        x_1 = mul_2 + l_self_modules_embeddings_modules_layer_norm_parameters_bias_
+        mul_2 = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        embeddings = torch.nn.functional.dropout(x_1, 0.1, False, False)
+        x_1 = None
+        mixed_query_layer = torch._C._nn.linear(
+            embeddings,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        mixed_key_layer = torch._C._nn.linear(
+            embeddings,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        mixed_value_layer = torch._C._nn.linear(
+            embeddings,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view = mixed_query_layer.view(1, -1, 12, 64)
+        mixed_query_layer = None
+        query_layer = view.transpose(1, 2)
+        view = None
+        view_1 = mixed_key_layer.view(1, -1, 12, 64)
+        mixed_key_layer = None
+        key_layer = view_1.transpose(1, 2)
+        view_1 = None
+        view_2 = mixed_value_layer.view(1, -1, 12, 64)
+        mixed_value_layer = None
+        value_layer = view_2.transpose(1, 2)
+        view_2 = None
+        transpose_3 = key_layer.transpose(-1, -2)
+        key_layer = None
+        attention_scores = torch.matmul(query_layer, transpose_3)
+        query_layer = transpose_3 = None
+        attention_scores_1 = attention_scores / 8.0
+        attention_scores = None
+        attention_scores_2 = attention_scores_1 + extended_attention_mask_2
+        attention_scores_1 = None
+        attention_probs = torch.nn.functional.softmax(attention_scores_2, dim=-1)
+        attention_scores_2 = None
+        attention_probs_1 = torch.nn.functional.dropout(
+            attention_probs, 0.1, False, False
+        )
+        attention_probs = None
+        context_layer = torch.matmul(attention_probs_1, value_layer)
+        attention_probs_1 = value_layer = None
+        permute = context_layer.permute(0, 2, 1, 3)
+        context_layer = None
+        context_layer_1 = permute.contiguous()
+        permute = None
+        context_layer_2 = context_layer_1.view(1, 11, 768)
+        context_layer_1 = None
+        hidden_states = torch._C._nn.linear(
+            context_layer_2,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False)
+        hidden_states = None
+        x_act_2 = embeddings + hidden_states_1
+        embeddings = hidden_states_1 = None
+        mean_3 = x_act_2.mean(axis=2, keepdim=True)
+        y_1 = x_act_2 - mean_3
+        x_act_2 = mean_3 = None
+        pow_2 = y_1**2
+        var_1 = torch.mean(pow_2, axis=2, keepdim=True)
+        pow_2 = None
+        add_8 = 1e-05 + var_1
+        var_1 = None
+        sqrt_1 = torch.sqrt(add_8)
+        add_8 = None
+        x_2 = y_1 / sqrt_1
+        y_1 = sqrt_1 = None
+        mul_3 = (
+            x_2
+            * l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_3 = (
+            mul_3
+            + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_2 = torch._C._nn.linear(
+            x_3,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_3 = torch._C._nn.gelu(hidden_states_2, approximate="none")
+        hidden_states_2 = None
+        hidden_states_4 = torch._C._nn.linear(
+            hidden_states_3,
+            weight=l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_3 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_5 = torch.nn.functional.dropout(
+            hidden_states_4, 0.1, False, False
+        )
+        hidden_states_4 = None
+        x_act_3 = x_3 + hidden_states_5
+        x_3 = hidden_states_5 = None
+        mean_6 = x_act_3.mean(axis=2, keepdim=True)
+        y_2 = x_act_3 - mean_6
+        x_act_3 = mean_6 = None
+        pow_3 = y_2**2
+        var_2 = torch.mean(pow_3, axis=2, keepdim=True)
+        pow_3 = None
+        add_11 = 1e-05 + var_2
+        var_2 = None
+        sqrt_2 = torch.sqrt(add_11)
+        add_11 = None
+        x_4 = y_2 / sqrt_2
+        y_2 = sqrt_2 = None
+        mul_4 = (
+            x_4
+            * l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_5 = (
+            mul_4
+            + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        mixed_query_layer_1 = torch._C._nn.linear(
+            x_5,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        mixed_key_layer_1 = torch._C._nn.linear(
+            x_5,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        mixed_value_layer_1 = torch._C._nn.linear(
+            x_5,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_4 = mixed_query_layer_1.view(1, -1, 12, 64)
+        mixed_query_layer_1 = None
+        query_layer_1 = view_4.transpose(1, 2)
+        view_4 = None
+        view_5 = mixed_key_layer_1.view(1, -1, 12, 64)
+        mixed_key_layer_1 = None
+        key_layer_1 = view_5.transpose(1, 2)
+        view_5 = None
+        view_6 = mixed_value_layer_1.view(1, -1, 12, 64)
+        mixed_value_layer_1 = None
+        value_layer_1 = view_6.transpose(1, 2)
+        view_6 = None
+        transpose_7 = key_layer_1.transpose(-1, -2)
+        key_layer_1 = None
+        attention_scores_3 = torch.matmul(query_layer_1, transpose_7)
+        query_layer_1 = transpose_7 = None
+        attention_scores_4 = attention_scores_3 / 8.0
+        attention_scores_3 = None
+        attention_scores_5 = attention_scores_4 + extended_attention_mask_2
+        attention_scores_4 = None
+        attention_probs_2 = torch.nn.functional.softmax(attention_scores_5, dim=-1)
+        attention_scores_5 = None
+        attention_probs_3 = torch.nn.functional.dropout(
+            attention_probs_2, 0.1, False, False
+        )
+        attention_probs_2 = None
+        context_layer_3 = torch.matmul(attention_probs_3, value_layer_1)
+        attention_probs_3 = value_layer_1 = None
+        permute_1 = context_layer_3.permute(0, 2, 1, 3)
+        context_layer_3 = None
+        context_layer_4 = permute_1.contiguous()
+        permute_1 = None
+        context_layer_5 = context_layer_4.view(1, 11, 768)
+        context_layer_4 = None
+        hidden_states_6 = torch._C._nn.linear(
+            context_layer_5,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_7 = torch.nn.functional.dropout(
+            hidden_states_6, 0.1, False, False
+        )
+        hidden_states_6 = None
+        x_act_4 = x_5 + hidden_states_7
+        x_5 = hidden_states_7 = None
+        mean_9 = x_act_4.mean(axis=2, keepdim=True)
+        y_3 = x_act_4 - mean_9
+        x_act_4 = mean_9 = None
+        pow_4 = y_3**2
+        var_3 = torch.mean(pow_4, axis=2, keepdim=True)
+        pow_4 = None
+        add_15 = 1e-05 + var_3
+        var_3 = None
+        sqrt_3 = torch.sqrt(add_15)
+        add_15 = None
+        x_6 = y_3 / sqrt_3
+        y_3 = sqrt_3 = None
+        mul_5 = (
+            x_6
+            * l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_6 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_7 = (
+            mul_5
+            + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_8 = torch._C._nn.linear(
+            x_7,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_,
+        )
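+        # Feed-forward block of the layer: intermediate dense -> GELU -> output
+        # dense, followed by dropout and a residual connection into LayerNorm.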
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_9 = torch._C._nn.gelu(hidden_states_8, approximate="none")
+        hidden_states_8 = None
+        hidden_states_10 = torch._C._nn.linear(
+            hidden_states_9,
+            weight=l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_9 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_11 = torch.nn.functional.dropout(
+            hidden_states_10, 0.1, False, False
+        )
+        hidden_states_10 = None
+        x_act_5 = x_7 + hidden_states_11
+        x_7 = hidden_states_11 = None
+        mean_12 = x_act_5.mean(axis=2, keepdim=True)
+        y_4 = x_act_5 - mean_12
+        x_act_5 = mean_12 = None
+        pow_5 = y_4**2
+        var_4 = torch.mean(pow_5, axis=2, keepdim=True)
+        pow_5 = None
+        add_18 = 1e-05 + var_4
+        var_4 = None
+        sqrt_4 = torch.sqrt(add_18)
+        add_18 = None
+        x_8 = y_4 / sqrt_4
+        y_4 = sqrt_4 = None
+        mul_6 = (
+            x_8
+            * l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_8 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_9 = (
+            mul_6
+            + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        mixed_query_layer_2 = torch._C._nn.linear(
+            x_9,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        mixed_key_layer_2 = torch._C._nn.linear(
+            x_9,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        mixed_value_layer_2 = torch._C._nn.linear(
+            x_9,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_8 = mixed_query_layer_2.view(1, -1, 12, 64)
+        mixed_query_layer_2 = None
+        query_layer_2 = view_8.transpose(1, 2)
+        view_8 = None
+        view_9 = mixed_key_layer_2.view(1, -1, 12, 64)
+        mixed_key_layer_2 = None
+        key_layer_2 = view_9.transpose(1, 2)
+        view_9 = None
+        view_10 = mixed_value_layer_2.view(1, -1, 12, 64)
+        mixed_value_layer_2 = None
+        value_layer_2 = view_10.transpose(1, 2)
+        view_10 = None
+        transpose_11 = key_layer_2.transpose(-1, -2)
+        key_layer_2 = None
+        attention_scores_6 = torch.matmul(query_layer_2, transpose_11)
+        query_layer_2 = transpose_11 = None
+        attention_scores_7 = attention_scores_6 / 8.0
+        attention_scores_6 = None
+        attention_scores_8 = attention_scores_7 + extended_attention_mask_2
+        attention_scores_7 = None
+        attention_probs_4 = torch.nn.functional.softmax(attention_scores_8, dim=-1)
+        attention_scores_8 = None
+        attention_probs_5 = torch.nn.functional.dropout(
+            attention_probs_4, 0.1, False, False
+        )
+        attention_probs_4 = None
+        context_layer_6 = torch.matmul(attention_probs_5, value_layer_2)
+        attention_probs_5 = value_layer_2 = None
+        permute_2 = context_layer_6.permute(0, 2, 1, 3)
+        context_layer_6 = None
+        context_layer_7 = permute_2.contiguous()
+        permute_2 = None
+        context_layer_8 = context_layer_7.view(1, 11, 768)
+        context_layer_7 = None
+        hidden_states_12 = torch._C._nn.linear(
+            context_layer_8,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_13 = torch.nn.functional.dropout(
+            hidden_states_12, 0.1, False, False
+        )
+        hidden_states_12 = None
+        x_act_6 = x_9 + hidden_states_13
+        x_9 = hidden_states_13 = None
+        mean_15 = x_act_6.mean(axis=2, keepdim=True)
+        y_5 = x_act_6 - mean_15
+        x_act_6 = mean_15 = None
+        pow_6 = y_5**2
+        var_5 = torch.mean(pow_6, axis=2, keepdim=True)
+        pow_6 = None
+        add_22 = 1e-05 + var_5
+        var_5 = None
+        sqrt_5 = torch.sqrt(add_22)
+        add_22 = None
+        x_10 = y_5 / sqrt_5
+        y_5 = sqrt_5 = None
+        mul_7 = (
+            x_10
+            * l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_10 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_11 = (
+            mul_7
+            + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_7 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_14 = torch._C._nn.linear(
+            x_11,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_15 = torch._C._nn.gelu(hidden_states_14, approximate="none")
+        hidden_states_14 = None
+        hidden_states_16 = torch._C._nn.linear(
+            hidden_states_15,
+            weight=l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_15 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_17 = torch.nn.functional.dropout(
+            hidden_states_16, 0.1, False, False
+        )
+        hidden_states_16 = None
+        x_act_7 = x_11 + hidden_states_17
+        x_11 = hidden_states_17 = None
+        mean_18 = x_act_7.mean(axis=2, keepdim=True)
+        y_6 = x_act_7 - mean_18
+        x_act_7 = mean_18 = None
+        pow_7 = y_6**2
+        var_6 = torch.mean(pow_7, axis=2, keepdim=True)
+        pow_7 = None
+        add_25 = 1e-05 + var_6
+        var_6 = None
+        sqrt_6 = torch.sqrt(add_25)
+        add_25 = None
+        x_12 = y_6 / sqrt_6
+        y_6 = sqrt_6 = None
+        mul_8 = (
+            x_12
+            * l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_12 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_13 = (
+            mul_8
+            + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_8 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        mixed_query_layer_3 = torch._C._nn.linear(
+            x_13,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        mixed_key_layer_3 = torch._C._nn.linear(
+            x_13,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        mixed_value_layer_3 = torch._C._nn.linear(
+            x_13,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_12 = mixed_query_layer_3.view(1, -1, 12, 64)
+        mixed_query_layer_3 = None
+        query_layer_3 = view_12.transpose(1, 2)
+        view_12 = None
+        view_13 = mixed_key_layer_3.view(1, -1, 12, 64)
+        mixed_key_layer_3 = None
+        key_layer_3 = view_13.transpose(1, 2)
+        view_13 = None
+        view_14 = mixed_value_layer_3.view(1, -1, 12, 64)
+        mixed_value_layer_3 = None
+        value_layer_3 = view_14.transpose(1, 2)
+        view_14 = None
+        transpose_15 = key_layer_3.transpose(-1, -2)
+        key_layer_3 = None
+        attention_scores_9 = torch.matmul(query_layer_3, transpose_15)
+        query_layer_3 = transpose_15 = None
+        attention_scores_10 = attention_scores_9 / 8.0
+        attention_scores_9 = None
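+        # Attention scores are scaled by sqrt(head_dim) = sqrt(64) = 8.0; the
+        # additive mask (0 for kept positions, ~-3.4e38 for padding) is applied
+        # before the softmax below.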
+        attention_scores_11 = attention_scores_10 + extended_attention_mask_2
+        attention_scores_10 = None
+        attention_probs_6 = torch.nn.functional.softmax(attention_scores_11, dim=-1)
+        attention_scores_11 = None
+        attention_probs_7 = torch.nn.functional.dropout(
+            attention_probs_6, 0.1, False, False
+        )
+        attention_probs_6 = None
+        context_layer_9 = torch.matmul(attention_probs_7, value_layer_3)
+        attention_probs_7 = value_layer_3 = None
+        permute_3 = context_layer_9.permute(0, 2, 1, 3)
+        context_layer_9 = None
+        context_layer_10 = permute_3.contiguous()
+        permute_3 = None
+        context_layer_11 = context_layer_10.view(1, 11, 768)
+        context_layer_10 = None
+        hidden_states_18 = torch._C._nn.linear(
+            context_layer_11,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_19 = torch.nn.functional.dropout(
+            hidden_states_18, 0.1, False, False
+        )
+        hidden_states_18 = None
+        x_act_8 = x_13 + hidden_states_19
+        x_13 = hidden_states_19 = None
+        mean_21 = x_act_8.mean(axis=2, keepdim=True)
+        y_7 = x_act_8 - mean_21
+        x_act_8 = mean_21 = None
+        pow_8 = y_7**2
+        var_7 = torch.mean(pow_8, axis=2, keepdim=True)
+        pow_8 = None
+        add_29 = 1e-05 + var_7
+        var_7 = None
+        sqrt_7 = torch.sqrt(add_29)
+        add_29 = None
+        x_14 = y_7 / sqrt_7
+        y_7 = sqrt_7 = None
+        mul_9 = (
+            x_14
+            * l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+        )
+        x_14 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None)
+        x_15 = (
+            mul_9
+            + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+        )
+        mul_9 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_20 = torch._C._nn.linear(
+            x_15,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_21 = torch._C._nn.gelu(hidden_states_20, approximate="none")
+        hidden_states_20 = None
+        hidden_states_22 = torch._C._nn.linear(
+            hidden_states_21,
+            weight=l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_,
+            bias=l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_21 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_23 = torch.nn.functional.dropout(
+            hidden_states_22, 0.1, False, False
+        )
+        hidden_states_22 = None
+        x_act_9 = x_15 + hidden_states_23
+        x_15 = hidden_states_23 = None
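+        # Unfused LayerNorm over the residual sum: mean and variance along the
+        # hidden axis with eps = 1e-05, then the learned scale and shift.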
+    mean_24 = x_act_9.mean(axis=2, keepdim=True)
+    y_8 = x_act_9 - mean_24
+    x_act_9 = mean_24 = None
+    pow_9 = y_8**2
+    var_8 = torch.mean(pow_9, axis=2, keepdim=True)
+    pow_9 = None
+    add_32 = 1e-05 + var_8
+    var_8 = None
+    sqrt_8 = torch.sqrt(add_32)
+    add_32 = None
+    x_16 = y_8 / sqrt_8
+    y_8 = sqrt_8 = None
+    mul_10 = x_16 * l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_
+    x_16 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_17 = mul_10 + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_
+    mul_10 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_4 = torch._C._nn.linear(x_17, weight=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_4 = torch._C._nn.linear(x_17, weight=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_4 = torch._C._nn.linear(x_17, weight=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_16 = mixed_query_layer_4.view(1, -1, 12, 64)
+    mixed_query_layer_4 = None
+    query_layer_4 = view_16.transpose(1, 2)
+    view_16 = None
+    view_17 = mixed_key_layer_4.view(1, -1, 12, 64)
+    mixed_key_layer_4 = None
+    key_layer_4 = view_17.transpose(1, 2)
+    view_17 = None
+    view_18 = mixed_value_layer_4.view(1, -1, 12, 64)
+    mixed_value_layer_4 = None
+    value_layer_4 = view_18.transpose(1, 2)
+    view_18 = None
+    transpose_19 = key_layer_4.transpose(-1, -2)
+    key_layer_4 = None
+    attention_scores_12 = torch.matmul(query_layer_4, transpose_19)
+    query_layer_4 = transpose_19 = None
+    attention_scores_13 = attention_scores_12 / 8.0
+    attention_scores_12 = None
+    attention_scores_14 = attention_scores_13 + extended_attention_mask_2
+    attention_scores_13 = None
+    attention_probs_8 = torch.nn.functional.softmax(attention_scores_14, dim=-1)
+    attention_scores_14 = None
+    attention_probs_9 = torch.nn.functional.dropout(attention_probs_8, 0.1, False, False)
+    attention_probs_8 = None
+    context_layer_12 = torch.matmul(attention_probs_9, value_layer_4)
+    attention_probs_9 = value_layer_4 = None
+    permute_4 = context_layer_12.permute(0, 2, 1, 3)
+    context_layer_12 = None
+    context_layer_13 = permute_4.contiguous()
+    permute_4 = None
+    context_layer_14 = context_layer_13.view(1, 11, 768)
+    context_layer_13 = None
+    hidden_states_24 = torch._C._nn.linear(context_layer_14, weight=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_25 = torch.nn.functional.dropout(hidden_states_24, 0.1, False, False)
+    hidden_states_24 = None
+    x_act_10 = x_17 + hidden_states_25
+    x_17 = hidden_states_25 = None
+    mean_27 = x_act_10.mean(axis=2, keepdim=True)
+    y_9 = x_act_10 - mean_27
+    x_act_10 = mean_27 = None
+    pow_10 = y_9**2
+    var_9 = torch.mean(pow_10, axis=2, keepdim=True)
+    pow_10 = None
+    add_36 = 1e-05 + var_9
+    var_9 = None
+    sqrt_9 = torch.sqrt(add_36)
+    add_36 = None
+    x_18 = y_9 / sqrt_9
+    y_9 = sqrt_9 = None
+    mul_11 = x_18 * l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_18 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_19 = mul_11 + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_11 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_26 = torch._C._nn.linear(x_19, weight=l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_27 = torch._C._nn.gelu(hidden_states_26, approximate="none")
+    hidden_states_26 = None
+    hidden_states_28 = torch._C._nn.linear(hidden_states_27, weight=l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_)
+    hidden_states_27 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_29 = torch.nn.functional.dropout(hidden_states_28, 0.1, False, False)
+    hidden_states_28 = None
+    x_act_11 = x_19 + hidden_states_29
+    x_19 = hidden_states_29 = None
+    mean_30 = x_act_11.mean(axis=2, keepdim=True)
+    y_10 = x_act_11 - mean_30
+    x_act_11 = mean_30 = None
+    pow_11 = y_10**2
+    var_10 = torch.mean(pow_11, axis=2, keepdim=True)
+    pow_11 = None
+    add_39 = 1e-05 + var_10
+    var_10 = None
+    sqrt_10 = torch.sqrt(add_39)
+    add_39 = None
+    x_20 = y_10 / sqrt_10
+    y_10 = sqrt_10 = None
+    mul_12 = x_20 * l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_
+    x_20 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_21 = mul_12 + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_
+    mul_12 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_5 = torch._C._nn.linear(x_21, weight=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_5 = torch._C._nn.linear(x_21, weight=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_5 = torch._C._nn.linear(x_21, weight=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_20 = mixed_query_layer_5.view(1, -1, 12, 64)
+    mixed_query_layer_5 = None
+    query_layer_5 = view_20.transpose(1, 2)
+    view_20 = None
+    view_21 = mixed_key_layer_5.view(1, -1, 12, 64)
+    mixed_key_layer_5 = None
+    key_layer_5 = view_21.transpose(1, 2)
+    view_21 = None
+    view_22 = mixed_value_layer_5.view(1, -1, 12, 64)
+    mixed_value_layer_5 = None
+    value_layer_5 = view_22.transpose(1, 2)
+    view_22 = None
+    transpose_23 = key_layer_5.transpose(-1, -2)
+    key_layer_5 = None
+    attention_scores_15 = torch.matmul(query_layer_5, transpose_23)
+    query_layer_5 = transpose_23 = None
+    attention_scores_16 = attention_scores_15 / 8.0
+    attention_scores_15 = None
+    attention_scores_17 = attention_scores_16 + extended_attention_mask_2
+    attention_scores_16 = None
+    attention_probs_10 = torch.nn.functional.softmax(attention_scores_17, dim=-1)
+    attention_scores_17 = None
+    attention_probs_11 = torch.nn.functional.dropout(attention_probs_10, 0.1, False, False)
+    attention_probs_10 = None
+    context_layer_15 = torch.matmul(attention_probs_11, value_layer_5)
+    attention_probs_11 = value_layer_5 = None
+    permute_5 = context_layer_15.permute(0, 2, 1, 3)
+    context_layer_15 = None
+    context_layer_16 = permute_5.contiguous()
+    permute_5 = None
+    context_layer_17 = context_layer_16.view(1, 11, 768)
+    context_layer_16 = None
+    hidden_states_30 = torch._C._nn.linear(context_layer_17, weight=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_31 = torch.nn.functional.dropout(hidden_states_30, 0.1, False, False)
+    hidden_states_30 = None
+    x_act_12 = x_21 + hidden_states_31
+    x_21 = hidden_states_31 = None
+    mean_33 = x_act_12.mean(axis=2, keepdim=True)
+    y_11 = x_act_12 - mean_33
+    x_act_12 = mean_33 = None
+    pow_12 = y_11**2
+    var_11 = torch.mean(pow_12, axis=2, keepdim=True)
+    pow_12 = None
+    add_43 = 1e-05 + var_11
+    var_11 = None
+    sqrt_11 = torch.sqrt(add_43)
+    add_43 = None
+    x_22 = y_11 / sqrt_11
+    y_11 = sqrt_11 = None
+    mul_13 = x_22 * l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_22 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_23 = mul_13 + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_13 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_32 = torch._C._nn.linear(x_23, weight=l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_33 = torch._C._nn.gelu(hidden_states_32, approximate="none")
+    hidden_states_32 = None
+    hidden_states_34 = torch._C._nn.linear(hidden_states_33, weight=l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_)
+    hidden_states_33 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_35 = torch.nn.functional.dropout(hidden_states_34, 0.1, False, False)
+    hidden_states_34 = None
+    x_act_13 = x_23 + hidden_states_35
+    x_23 = hidden_states_35 = None
+    mean_36 = x_act_13.mean(axis=2, keepdim=True)
+    y_12 = x_act_13 - mean_36
+    x_act_13 = mean_36 = None
+    pow_13 = y_12**2
+    var_12 = torch.mean(pow_13, axis=2, keepdim=True)
+    pow_13 = None
+    add_46 = 1e-05 + var_12
+    var_12 = None
+    sqrt_12 = torch.sqrt(add_46)
+    add_46 = None
+    x_24 = y_12 / sqrt_12
+    y_12 = sqrt_12 = None
+    mul_14 = x_24 * l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_
+    x_24 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_25 = mul_14 + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_
+    mul_14 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
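+    # NOTE: encoder layers 6-11 below repeat the same attention + feed-forward pattern; locals are
+    # reassigned to None as soon as they are dead so intermediate tensors can be freed early.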
+    mixed_query_layer_6 = torch._C._nn.linear(x_25, weight=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_6 = torch._C._nn.linear(x_25, weight=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_6 = torch._C._nn.linear(x_25, weight=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_24 = mixed_query_layer_6.view(1, -1, 12, 64)
+    mixed_query_layer_6 = None
+    query_layer_6 = view_24.transpose(1, 2)
+    view_24 = None
+    view_25 = mixed_key_layer_6.view(1, -1, 12, 64)
+    mixed_key_layer_6 = None
+    key_layer_6 = view_25.transpose(1, 2)
+    view_25 = None
+    view_26 = mixed_value_layer_6.view(1, -1, 12, 64)
+    mixed_value_layer_6 = None
+    value_layer_6 = view_26.transpose(1, 2)
+    view_26 = None
+    transpose_27 = key_layer_6.transpose(-1, -2)
+    key_layer_6 = None
+    attention_scores_18 = torch.matmul(query_layer_6, transpose_27)
+    query_layer_6 = transpose_27 = None
+    attention_scores_19 = attention_scores_18 / 8.0
+    attention_scores_18 = None
+    attention_scores_20 = attention_scores_19 + extended_attention_mask_2
+    attention_scores_19 = None
+    attention_probs_12 = torch.nn.functional.softmax(attention_scores_20, dim=-1)
+    attention_scores_20 = None
+    attention_probs_13 = torch.nn.functional.dropout(attention_probs_12, 0.1, False, False)
+    attention_probs_12 = None
+    context_layer_18 = torch.matmul(attention_probs_13, value_layer_6)
+    attention_probs_13 = value_layer_6 = None
+    permute_6 = context_layer_18.permute(0, 2, 1, 3)
+    context_layer_18 = None
+    context_layer_19 = permute_6.contiguous()
+    permute_6 = None
+    context_layer_20 = context_layer_19.view(1, 11, 768)
+    context_layer_19 = None
+    hidden_states_36 = torch._C._nn.linear(context_layer_20, weight=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_37 = torch.nn.functional.dropout(hidden_states_36, 0.1, False, False)
+    hidden_states_36 = None
+    x_act_14 = x_25 + hidden_states_37
+    x_25 = hidden_states_37 = None
+    mean_39 = x_act_14.mean(axis=2, keepdim=True)
+    y_13 = x_act_14 - mean_39
+    x_act_14 = mean_39 = None
+    pow_14 = y_13**2
+    var_13 = torch.mean(pow_14, axis=2, keepdim=True)
+    pow_14 = None
+    add_50 = 1e-05 + var_13
+    var_13 = None
+    sqrt_13 = torch.sqrt(add_50)
+    add_50 = None
+    x_26 = y_13 / sqrt_13
+    y_13 = sqrt_13 = None
+    mul_15 = x_26 * l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_26 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_27 = mul_15 + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_15 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_38 = torch._C._nn.linear(x_27, weight=l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_39 = torch._C._nn.gelu(hidden_states_38, approximate="none")
+    hidden_states_38 = None
+    hidden_states_40 = torch._C._nn.linear(hidden_states_39, weight=l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_)
+    hidden_states_39 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_41 = torch.nn.functional.dropout(hidden_states_40, 0.1, False, False)
+    hidden_states_40 = None
+    x_act_15 = x_27 + hidden_states_41
+    x_27 = hidden_states_41 = None
+    mean_42 = x_act_15.mean(axis=2, keepdim=True)
+    y_14 = x_act_15 - mean_42
+    x_act_15 = mean_42 = None
+    pow_15 = y_14**2
+    var_14 = torch.mean(pow_15, axis=2, keepdim=True)
+    pow_15 = None
+    add_53 = 1e-05 + var_14
+    var_14 = None
+    sqrt_14 = torch.sqrt(add_53)
+    add_53 = None
+    x_28 = y_14 / sqrt_14
+    y_14 = sqrt_14 = None
+    mul_16 = x_28 * l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_
+    x_28 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_29 = mul_16 + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_
+    mul_16 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_7 = torch._C._nn.linear(x_29, weight=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_7 = torch._C._nn.linear(x_29, weight=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_7 = torch._C._nn.linear(x_29, weight=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_28 = mixed_query_layer_7.view(1, -1, 12, 64)
+    mixed_query_layer_7 = None
+    query_layer_7 = view_28.transpose(1, 2)
+    view_28 = None
+    view_29 = mixed_key_layer_7.view(1, -1, 12, 64)
+    mixed_key_layer_7 = None
+    key_layer_7 = view_29.transpose(1, 2)
+    view_29 = None
+    view_30 = mixed_value_layer_7.view(1, -1, 12, 64)
+    mixed_value_layer_7 = None
+    value_layer_7 = view_30.transpose(1, 2)
+    view_30 = None
+    transpose_31 = key_layer_7.transpose(-1, -2)
+    key_layer_7 = None
+    attention_scores_21 = torch.matmul(query_layer_7, transpose_31)
+    query_layer_7 = transpose_31 = None
+    attention_scores_22 = attention_scores_21 / 8.0
+    attention_scores_21 = None
+    attention_scores_23 = attention_scores_22 + extended_attention_mask_2
+    attention_scores_22 = None
+    attention_probs_14 = torch.nn.functional.softmax(attention_scores_23, dim=-1)
+    attention_scores_23 = None
+    attention_probs_15 = torch.nn.functional.dropout(attention_probs_14, 0.1, False, False)
+    attention_probs_14 = None
+    context_layer_21 = torch.matmul(attention_probs_15, value_layer_7)
+    attention_probs_15 = value_layer_7 = None
+    permute_7 = context_layer_21.permute(0, 2, 1, 3)
+    context_layer_21 = None
+    context_layer_22 = permute_7.contiguous()
+    permute_7 = None
+    context_layer_23 = context_layer_22.view(1, 11, 768)
+    context_layer_22 = None
+    hidden_states_42 = torch._C._nn.linear(context_layer_23, weight=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_43 = torch.nn.functional.dropout(hidden_states_42, 0.1, False, False)
+    hidden_states_42 = None
+    x_act_16 = x_29 + hidden_states_43
+    x_29 = hidden_states_43 = None
+    mean_45 = x_act_16.mean(axis=2, keepdim=True)
+    y_15 = x_act_16 - mean_45
+    x_act_16 = mean_45 = None
+    pow_16 = y_15**2
+    var_15 = torch.mean(pow_16, axis=2, keepdim=True)
+    pow_16 = None
+    add_57 = 1e-05 + var_15
+    var_15 = None
+    sqrt_15 = torch.sqrt(add_57)
+    add_57 = None
+    x_30 = y_15 / sqrt_15
+    y_15 = sqrt_15 = None
+    mul_17 = x_30 * l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_30 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_31 = mul_17 + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_17 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_44 = torch._C._nn.linear(x_31, weight=l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_45 = torch._C._nn.gelu(hidden_states_44, approximate="none")
+    hidden_states_44 = None
+    hidden_states_46 = torch._C._nn.linear(hidden_states_45, weight=l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_)
+    hidden_states_45 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_47 = torch.nn.functional.dropout(hidden_states_46, 0.1, False, False)
+    hidden_states_46 = None
+    x_act_17 = x_31 + hidden_states_47
+    x_31 = hidden_states_47 = None
+    mean_48 = x_act_17.mean(axis=2, keepdim=True)
+    y_16 = x_act_17 - mean_48
+    x_act_17 = mean_48 = None
+    pow_17 = y_16**2
+    var_16 = torch.mean(pow_17, axis=2, keepdim=True)
+    pow_17 = None
+    add_60 = 1e-05 + var_16
+    var_16 = None
+    sqrt_16 = torch.sqrt(add_60)
+    add_60 = None
+    x_32 = y_16 / sqrt_16
+    y_16 = sqrt_16 = None
+    mul_18 = x_32 * l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_
+    x_32 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_33 = mul_18 + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_
+    mul_18 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_8 = torch._C._nn.linear(x_33, weight=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_8 = torch._C._nn.linear(x_33, weight=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_8 = torch._C._nn.linear(x_33, weight=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_32 = mixed_query_layer_8.view(1, -1, 12, 64)
+    mixed_query_layer_8 = None
+    query_layer_8 = view_32.transpose(1, 2)
+    view_32 = None
+    view_33 = mixed_key_layer_8.view(1, -1, 12, 64)
+    mixed_key_layer_8 = None
+    key_layer_8 = view_33.transpose(1, 2)
+    view_33 = None
+    view_34 = mixed_value_layer_8.view(1, -1, 12, 64)
+    mixed_value_layer_8 = None
+    value_layer_8 = view_34.transpose(1, 2)
+    view_34 = None
+    transpose_35 = key_layer_8.transpose(-1, -2)
+    key_layer_8 = None
+    attention_scores_24 = torch.matmul(query_layer_8, transpose_35)
+    query_layer_8 = transpose_35 = None
+    attention_scores_25 = attention_scores_24 / 8.0
+    attention_scores_24 = None
+    attention_scores_26 = attention_scores_25 + extended_attention_mask_2
+    attention_scores_25 = None
+    attention_probs_16 = torch.nn.functional.softmax(attention_scores_26, dim=-1)
+    attention_scores_26 = None
+    attention_probs_17 = torch.nn.functional.dropout(attention_probs_16, 0.1, False, False)
+    attention_probs_16 = None
+    context_layer_24 = torch.matmul(attention_probs_17, value_layer_8)
+    attention_probs_17 = value_layer_8 = None
+    permute_8 = context_layer_24.permute(0, 2, 1, 3)
+    context_layer_24 = None
+    context_layer_25 = permute_8.contiguous()
+    permute_8 = None
+    context_layer_26 = context_layer_25.view(1, 11, 768)
+    context_layer_25 = None
+    hidden_states_48 = torch._C._nn.linear(context_layer_26, weight=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_49 = torch.nn.functional.dropout(hidden_states_48, 0.1, False, False)
+    hidden_states_48 = None
+    x_act_18 = x_33 + hidden_states_49
+    x_33 = hidden_states_49 = None
+    mean_51 = x_act_18.mean(axis=2, keepdim=True)
+    y_17 = x_act_18 - mean_51
+    x_act_18 = mean_51 = None
+    pow_18 = y_17**2
+    var_17 = torch.mean(pow_18, axis=2, keepdim=True)
+    pow_18 = None
+    add_64 = 1e-05 + var_17
+    var_17 = None
+    sqrt_17 = torch.sqrt(add_64)
+    add_64 = None
+    x_34 = y_17 / sqrt_17
+    y_17 = sqrt_17 = None
+    mul_19 = x_34 * l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_34 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_35 = mul_19 + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_19 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_50 = torch._C._nn.linear(x_35, weight=l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_51 = torch._C._nn.gelu(hidden_states_50, approximate="none")
+    hidden_states_50 = None
+    hidden_states_52 = torch._C._nn.linear(hidden_states_51, weight=l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_)
+    hidden_states_51 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_53 = torch.nn.functional.dropout(hidden_states_52, 0.1, False, False)
+    hidden_states_52 = None
+    x_act_19 = x_35 + hidden_states_53
+    x_35 = hidden_states_53 = None
+    mean_54 = x_act_19.mean(axis=2, keepdim=True)
+    y_18 = x_act_19 - mean_54
+    x_act_19 = mean_54 = None
+    pow_19 = y_18**2
+    var_18 = torch.mean(pow_19, axis=2, keepdim=True)
+    pow_19 = None
+    add_67 = 1e-05 + var_18
+    var_18 = None
+    sqrt_18 = torch.sqrt(add_67)
+    add_67 = None
+    x_36 = y_18 / sqrt_18
+    y_18 = sqrt_18 = None
+    mul_20 = x_36 * l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_
+    x_36 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_37 = mul_20 + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_
+    mul_20 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_9 = torch._C._nn.linear(x_37, weight=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_9 = torch._C._nn.linear(x_37, weight=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_9 = torch._C._nn.linear(x_37, weight=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_36 = mixed_query_layer_9.view(1, -1, 12, 64)
+    mixed_query_layer_9 = None
+    query_layer_9 = view_36.transpose(1, 2)
+    view_36 = None
+    view_37 = mixed_key_layer_9.view(1, -1, 12, 64)
+    mixed_key_layer_9 = None
+    key_layer_9 = view_37.transpose(1, 2)
+    view_37 = None
+    view_38 = mixed_value_layer_9.view(1, -1, 12, 64)
+    mixed_value_layer_9 = None
+    value_layer_9 = view_38.transpose(1, 2)
+    view_38 = None
+    transpose_39 = key_layer_9.transpose(-1, -2)
+    key_layer_9 = None
+    attention_scores_27 = torch.matmul(query_layer_9, transpose_39)
+    query_layer_9 = transpose_39 = None
+    attention_scores_28 = attention_scores_27 / 8.0
+    attention_scores_27 = None
+    attention_scores_29 = attention_scores_28 + extended_attention_mask_2
+    attention_scores_28 = None
+    attention_probs_18 = torch.nn.functional.softmax(attention_scores_29, dim=-1)
+    attention_scores_29 = None
+    attention_probs_19 = torch.nn.functional.dropout(attention_probs_18, 0.1, False, False)
+    attention_probs_18 = None
+    context_layer_27 = torch.matmul(attention_probs_19, value_layer_9)
+    attention_probs_19 = value_layer_9 = None
+    permute_9 = context_layer_27.permute(0, 2, 1, 3)
+    context_layer_27 = None
+    context_layer_28 = permute_9.contiguous()
+    permute_9 = None
+    context_layer_29 = context_layer_28.view(1, 11, 768)
+    context_layer_28 = None
+    hidden_states_54 = torch._C._nn.linear(context_layer_29, weight=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_55 = torch.nn.functional.dropout(hidden_states_54, 0.1, False, False)
+    hidden_states_54 = None
+    x_act_20 = x_37 + hidden_states_55
+    x_37 = hidden_states_55 = None
+    mean_57 = x_act_20.mean(axis=2, keepdim=True)
+    y_19 = x_act_20 - mean_57
+    x_act_20 = mean_57 = None
+    pow_20 = y_19**2
+    var_19 = torch.mean(pow_20, axis=2, keepdim=True)
+    pow_20 = None
+    add_71 = 1e-05 + var_19
+    var_19 = None
+    sqrt_19 = torch.sqrt(add_71)
+    add_71 = None
+    x_38 = y_19 / sqrt_19
+    y_19 = sqrt_19 = None
+    mul_21 = x_38 * l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_38 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_39 = mul_21 + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_21 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_56 = torch._C._nn.linear(x_39, weight=l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_57 = torch._C._nn.gelu(hidden_states_56, approximate="none")
+    hidden_states_56 = None
+    hidden_states_58 = torch._C._nn.linear(hidden_states_57, weight=l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_)
+    hidden_states_57 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_59 = torch.nn.functional.dropout(hidden_states_58, 0.1, False, False)
+    hidden_states_58 = None
+    x_act_21 = x_39 + hidden_states_59
+    x_39 = hidden_states_59 = None
+    mean_60 = x_act_21.mean(axis=2, keepdim=True)
+    y_20 = x_act_21 - mean_60
+    x_act_21 = mean_60 = None
+    pow_21 = y_20**2
+    var_20 = torch.mean(pow_21, axis=2, keepdim=True)
+    pow_21 = None
+    add_74 = 1e-05 + var_20
+    var_20 = None
+    sqrt_20 = torch.sqrt(add_74)
+    add_74 = None
+    x_40 = y_20 / sqrt_20
+    y_20 = sqrt_20 = None
+    mul_22 = x_40 * l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_
+    x_40 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_41 = mul_22 + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_
+    mul_22 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_10 = torch._C._nn.linear(x_41, weight=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_10 = torch._C._nn.linear(x_41, weight=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_10 = torch._C._nn.linear(x_41, weight=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_40 = mixed_query_layer_10.view(1, -1, 12, 64)
+    mixed_query_layer_10 = None
+    query_layer_10 = view_40.transpose(1, 2)
+    view_40 = None
+    view_41 = mixed_key_layer_10.view(1, -1, 12, 64)
+    mixed_key_layer_10 = None
+    key_layer_10 = view_41.transpose(1, 2)
+    view_41 = None
+    view_42 = mixed_value_layer_10.view(1, -1, 12, 64)
+    mixed_value_layer_10 = None
+    value_layer_10 = view_42.transpose(1, 2)
+    view_42 = None
+    transpose_43 = key_layer_10.transpose(-1, -2)
+    key_layer_10 = None
+    attention_scores_30 = torch.matmul(query_layer_10, transpose_43)
+    query_layer_10 = transpose_43 = None
+    attention_scores_31 = attention_scores_30 / 8.0
+    attention_scores_30 = None
+    attention_scores_32 = attention_scores_31 + extended_attention_mask_2
+    attention_scores_31 = None
+    attention_probs_20 = torch.nn.functional.softmax(attention_scores_32, dim=-1)
+    attention_scores_32 = None
+    attention_probs_21 = torch.nn.functional.dropout(attention_probs_20, 0.1, False, False)
+    attention_probs_20 = None
+    context_layer_30 = torch.matmul(attention_probs_21, value_layer_10)
+    attention_probs_21 = value_layer_10 = None
+    permute_10 = context_layer_30.permute(0, 2, 1, 3)
+    context_layer_30 = None
+    context_layer_31 = permute_10.contiguous()
+    permute_10 = None
+    context_layer_32 = context_layer_31.view(1, 11, 768)
+    context_layer_31 = None
+    hidden_states_60 = torch._C._nn.linear(context_layer_32, weight=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_61 = torch.nn.functional.dropout(hidden_states_60, 0.1, False, False)
+    hidden_states_60 = None
+    x_act_22 = x_41 + hidden_states_61
+    x_41 = hidden_states_61 = None
+    mean_63 = x_act_22.mean(axis=2, keepdim=True)
+    y_21 = x_act_22 - mean_63
+    x_act_22 = mean_63 = None
+    pow_22 = y_21**2
+    var_21 = torch.mean(pow_22, axis=2, keepdim=True)
+    pow_22 = None
+    add_78 = 1e-05 + var_21
+    var_21 = None
+    sqrt_21 = torch.sqrt(add_78)
+    add_78 = None
+    x_42 = y_21 / sqrt_21
+    y_21 = sqrt_21 = None
+    mul_23 = x_42 * l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_42 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_43 = mul_23 + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_23 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_62 = torch._C._nn.linear(x_43, weight=l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_)
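+    # NOTE: feed-forward block: dense -> GELU -> dense, with dropout(p=0.1) disabled (train=False).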
+    l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_63 = torch._C._nn.gelu(hidden_states_62, approximate="none")
+    hidden_states_62 = None
+    hidden_states_64 = torch._C._nn.linear(hidden_states_63, weight=l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_)
+    hidden_states_63 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_65 = torch.nn.functional.dropout(hidden_states_64, 0.1, False, False)
+    hidden_states_64 = None
+    x_act_23 = x_43 + hidden_states_65
+    x_43 = hidden_states_65 = None
+    mean_66 = x_act_23.mean(axis=2, keepdim=True)
+    y_22 = x_act_23 - mean_66
+    x_act_23 = mean_66 = None
+    pow_23 = y_22**2
+    var_22 = torch.mean(pow_23, axis=2, keepdim=True)
+    pow_23 = None
+    add_81 = 1e-05 + var_22
+    var_22 = None
+    sqrt_22 = torch.sqrt(add_81)
+    add_81 = None
+    x_44 = y_22 / sqrt_22
+    y_22 = sqrt_22 = None
+    mul_24 = x_44 * l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_
+    x_44 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_45 = mul_24 + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_
+    mul_24 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = None
+    mixed_query_layer_11 = torch._C._nn.linear(x_45, weight=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    mixed_key_layer_11 = torch._C._nn.linear(x_45, weight=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    mixed_value_layer_11 = torch._C._nn.linear(x_45, weight=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_44 = mixed_query_layer_11.view(1, -1, 12, 64)
+    mixed_query_layer_11 = None
+    query_layer_11 = view_44.transpose(1, 2)
+    view_44 = None
+    view_45 = mixed_key_layer_11.view(1, -1, 12, 64)
+    mixed_key_layer_11 = None
+    key_layer_11 = view_45.transpose(1, 2)
+    view_45 = None
+    view_46 = mixed_value_layer_11.view(1, -1, 12, 64)
+    mixed_value_layer_11 = None
+    value_layer_11 = view_46.transpose(1, 2)
+    view_46 = None
+    transpose_47 = key_layer_11.transpose(-1, -2)
+    key_layer_11 = None
+    attention_scores_33 = torch.matmul(query_layer_11, transpose_47)
+    query_layer_11 = transpose_47 = None
+    attention_scores_34 = attention_scores_33 / 8.0
+    attention_scores_33 = None
+    attention_scores_35 = attention_scores_34 + extended_attention_mask_2
+    attention_scores_34 = extended_attention_mask_2 = None
+    attention_probs_22 = torch.nn.functional.softmax(attention_scores_35, dim=-1)
+    attention_scores_35 = None
+    attention_probs_23 = torch.nn.functional.dropout(attention_probs_22, 0.1, False, False)
+    attention_probs_22 = None
+    context_layer_33 = torch.matmul(attention_probs_23, value_layer_11)
+    attention_probs_23 = value_layer_11 = None
+    permute_11 = context_layer_33.permute(0, 2, 1, 3)
+    context_layer_33 = None
+    context_layer_34 = permute_11.contiguous()
+    permute_11 = None
+    context_layer_35 = context_layer_34.view(1, 11, 768)
+    context_layer_34 = None
+    hidden_states_66 = torch._C._nn.linear(context_layer_35, weight=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_67 = torch.nn.functional.dropout(hidden_states_66, 0.1, False, False)
+    hidden_states_66 = None
+    x_act_24 = x_45 + hidden_states_67
+    x_45 = hidden_states_67 = None
+    mean_69 = x_act_24.mean(axis=2, keepdim=True)
+    y_23 = x_act_24 - mean_69
+    x_act_24 = mean_69 = None
+    pow_24 = y_23**2
+    var_23 = torch.mean(pow_24, axis=2, keepdim=True)
+    pow_24 = None
+    add_85 = 1e-05 + var_23
+    var_23 = None
+    sqrt_23 = torch.sqrt(add_85)
+    add_85 = None
+    x_46 = y_23 / sqrt_23
+    y_23 = sqrt_23 = None
+    mul_25 = x_46 * l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_
+    x_46 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_47 = mul_25 + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_
+    mul_25 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_68 = torch._C._nn.linear(x_47, weight=l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_69 = torch._C._nn.gelu(hidden_states_68, approximate="none")
+    hidden_states_68 = None
+    hidden_states_70 = torch._C._nn.linear(hidden_states_69, weight=l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_, bias=l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_)
+    hidden_states_69 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_71 = torch.nn.functional.dropout(hidden_states_70, 0.1, False, False)
+    hidden_states_70 = None
+    x_act_25 = x_47 + hidden_states_71
+    x_47 = hidden_states_71 = None
+    mean_72 = x_act_25.mean(axis=2, keepdim=True)
+    y_24 = x_act_25 - mean_72
+    x_act_25 = mean_72 = None
+    pow_25 = y_24**2
+    var_24 = torch.mean(pow_25, axis=2, keepdim=True)
+    pow_25 = None
+    add_88 = 1e-05 + var_24
+    var_24 = None
+    sqrt_24 = torch.sqrt(add_88)
+    add_88 = None
+    x_48 = y_24 / sqrt_24
+    y_24 = sqrt_24 = None
+    mul_26 = x_48 * l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_
+    x_48 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = None
+    x_49 = mul_26 + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_
+    mul_26 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = None
+    first_token_tensor = x_49[(slice(None, None, None), 0)]
+    pooled_output = torch._C._nn.linear(first_token_tensor, l_self_modules_pooler_modules_dense_parameters_weight_, l_self_modules_pooler_modules_dense_parameters_bias_)
+    first_token_tensor = l_self_modules_pooler_modules_dense_parameters_weight_ = l_self_modules_pooler_modules_dense_parameters_bias_ = None
+    pooled_output_1 = torch.tanh(pooled_output)
+    pooled_output = None
+    return (x_49, pooled_output_1)
diff --git a/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/weight_meta.py b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/weight_meta.py
new file mode 100644
index 000000000..f18fcac51
--- /dev/null
+++ b/samples/transformers-auto-model/VitaliiVrublevskyi_ibert-roberta-base-finetuned-mrpc/weight_meta.py
@@ -0,0 +1,2008 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [0, 45288, 15721, 16, 341, 7, 14660, 38163, 36386, 4, 2]
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [50265, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [1, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [514, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
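+# NOTE: mean/std record summary statistics used to regenerate each weight tensor;
+# raw values (data) appear to be kept only for small integer tensors such as input_ids.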
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + 
dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = 
"cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name 
= "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] 
+ dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_:
+    name = "L_self_modules_pooler_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_:
+    name = "L_self_modules_pooler_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_hash.txt b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_hash.txt
new file mode 100644
index 000000000..5e0cb1c1d
--- /dev/null
+++ b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_hash.txt
@@ -0,0 +1 @@
+61f526c2cbe77396fcda2ae0c219008e04973c3ad69ea47c000f3cf1ce36c567
\ No newline at end of file
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_net.json b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/graph_net.json
@@ -0,0 +1,6 @@
+{
+  "framework": "torch",
+  "num_devices_required": 1,
+  "num_nodes_required": 1,
+  "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_meta.py b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_tensor_constraints.py b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/model.py b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/model.py
new file mode 100644
index 000000000..dd415a162
--- /dev/null
+++ b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/model.py
@@ -0,0 +1,1690 @@
+import torch
+
+from torch import device
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_rel_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + 
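+        # Editorial note: the signature repeats one identical parameter block
+        # per encoder layer (six layers, 0-5): query/key/value projections,
+        # the attention output dense + LayerNorm, and the intermediate /
+        # output dense + LayerNorm of the feed-forward block.
+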
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_attention_mask_ = L_attention_mask_ + 
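+        # Editorial note: this appears to be a torch.compile/Dynamo capture,
+        # which lifts every parameter into an explicit graph input. The block
+        # below only rebinds each lifted L_* argument to a local l_* alias
+        # (each is later reassigned to None to release the reference); no
+        # computation happens until the embedding lookup further down.
+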
l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_encoder_modules_rel_embeddings_parameters_weight_ = ( + L_self_modules_encoder_modules_rel_embeddings_parameters_weight_ + ) + l_self_modules_encoder_modules_layer_norm_parameters_weight_ = ( + L_self_modules_encoder_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_encoder_modules_layer_norm_parameters_bias_ = ( + L_self_modules_encoder_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            0,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        position_embeddings = torch.zeros_like(inputs_embeds)
+        position_embeddings = None
+        embeddings = torch.nn.functional.layer_norm(
+            inputs_embeds,
+            (768,),
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+            l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+            1e-07,
+        )
+        inputs_embeds = (
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        mask = l_attention_mask_.unsqueeze(2)
+        mask_1 = mask.to(torch.bfloat16)
+        mask = None
+        embeddings_1 = embeddings * mask_1
+        embeddings = mask_1 = None
+        embeddings_2 = torch.nn.functional.dropout(embeddings_1, 0.0, False, False)
+        embeddings_1 = None
+        unsqueeze_1 = l_attention_mask_.unsqueeze(1)
+        l_attention_mask_ = None
+        extended_attention_mask = unsqueeze_1.unsqueeze(2)
+        unsqueeze_1 = None
+        squeeze = extended_attention_mask.squeeze(-2)
+        unsqueeze_3 = squeeze.unsqueeze(-1)
+        squeeze = None
+        attention_mask = extended_attention_mask * unsqueeze_3
+        extended_attention_mask = unsqueeze_3 = None
+        q_ids = torch.arange(11, dtype=torch.int64, device=device(type="cuda", index=0))
+        k_ids = torch.arange(11, dtype=torch.int64, device=device(type="cuda", index=0))
+        getitem_1 = q_ids[(slice(None, None, None), None)]
+        q_ids = None
+        getitem_2 = k_ids[(None, slice(None, None, None))]
+        k_ids = None
+        rel_pos_ids = getitem_1 - getitem_2
+        getitem_1 = getitem_2 = None
+        sign = torch.sign(rel_pos_ids)
+        lt = rel_pos_ids < 128
+        gt = rel_pos_ids > -128
+        and_ = lt & gt
+        lt = gt = None
+        tensor = torch.tensor(127)
+        type_as = tensor.type_as(rel_pos_ids)
+        tensor = None
+        abs_1 = torch.abs(rel_pos_ids)
+        abs_pos = torch.where(and_, type_as, abs_1)
+        and_ = type_as = abs_1 = None
+        truediv = abs_pos / 128
+        log = torch.log(truediv)
+        truediv = None
+        tensor_1 = torch.tensor(3.9921875)
+        log_1 = torch.log(tensor_1)
+        tensor_1 = None
+        truediv_1 = log / log_1
+        log = log_1 = None
+        mul_2 = truediv_1 * 127
+        truediv_1 = None
+        ceil = torch.ceil(mul_2)
+        mul_2 = None
+        log_pos = ceil + 128
+        ceil = None
+        le = abs_pos <= 128
+        abs_pos = None
+        type_as_1 = rel_pos_ids.type_as(log_pos)
+        rel_pos_ids = None
+        mul_3 = log_pos * sign
+        log_pos = sign = None
+        bucket_pos = torch.where(le, type_as_1, mul_3)
+        le = type_as_1 = mul_3 = None
+        rel_pos_ids_1 = bucket_pos.to(torch.int64)
+        bucket_pos = None
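+        # Editorial note: the arithmetic above is the log-bucket
+        # relative-position mapping used by DeBERTa-style disentangled
+        # attention: offsets with |delta| <= 128 keep their exact value, while
+        # larger offsets are compressed logarithmically toward +/-255 before
+        # the +256 shift into the [0, 511] row range of the relative embedding
+        # table below. Roughly, delta = +300 maps to
+        # ceil(log(300 / 128) / log(3.9921875) * 127) + 128 = 207.
+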
rel_pos_ids_2 = rel_pos_ids_1[(slice(None, 11, None), slice(None, None, None))] + rel_pos_ids_1 = None + rel_pos_ids_3 = rel_pos_ids_2.unsqueeze(0) + rel_pos_ids_2 = None + rel_embeddings = torch.nn.functional.layer_norm( + l_self_modules_encoder_modules_rel_embeddings_parameters_weight_, + (768,), + l_self_modules_encoder_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_norm_parameters_bias_, + 1e-07, + ) + l_self_modules_encoder_modules_rel_embeddings_parameters_weight_ = ( + l_self_modules_encoder_modules_layer_norm_parameters_weight_ + ) = l_self_modules_encoder_modules_layer_norm_parameters_bias_ = None + linear = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_, + ) + x = linear.view((1, 11, 12, -1)) + linear = None + permute = x.permute(0, 2, 1, 3) + x = None + contiguous = permute.contiguous() + permute = None + query_layer = contiguous.view(-1, 11, 64) + contiguous = None + linear_1 = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_, + ) + x_1 = linear_1.view((1, 11, 12, -1)) + linear_1 = None + permute_1 = x_1.permute(0, 2, 1, 3) + x_1 = None + contiguous_1 = permute_1.contiguous() + permute_1 = None + key_layer = contiguous_1.view(-1, 11, 64) + contiguous_1 = None + linear_2 = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_ = (None) + x_2 = linear_2.view((1, 11, 12, -1)) + linear_2 = None + permute_2 = x_2.permute(0, 2, 1, 3) + x_2 = None + contiguous_2 = permute_2.contiguous() + permute_2 = None + value_layer = contiguous_2.view(-1, 11, 64) + contiguous_2 = None + tensor_2 = torch.tensor(64, dtype=torch.float32) + mul_4 = tensor_2 * 3 + tensor_2 = None + scale = torch.sqrt(mul_4) + mul_4 = None + transpose = key_layer.transpose(-1, -2) + to_2 = scale.to(dtype=torch.bfloat16) + scale = None + truediv_2 = transpose / to_2 + transpose = to_2 = None + attention_scores = torch.bmm(query_layer, truediv_2) + truediv_2 = None + rel_embeddings_1 = torch.nn.functional.dropout( + rel_embeddings, 0.0, False, False + ) + relative_pos = rel_pos_ids_3.unsqueeze(1) + relative_pos_1 = relative_pos.to( + device=device(type="cuda", index=0), dtype=torch.int64 + ) + relative_pos = None + getitem_4 = rel_embeddings_1[(slice(0, 512, None), slice(None, None, None))] + rel_embeddings_1 = None + rel_embeddings_2 = getitem_4.unsqueeze(0) + getitem_4 = None + linear_3 = torch._C._nn.linear( + rel_embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_, + ) + 
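+        # Editorial note: the blocks that follow compute DeBERTa's
+        # content-to-position (c2p) and position-to-content (p2c) attention
+        # terms: the relative-position embeddings are projected with the same
+        # query/key weights, gathered with the bucketed indices, and added to
+        # the content-content scores. The sqrt(64 * 3) factor is the usual
+        # scale for a 64-dim head with three summed attention components.
+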
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_ = (None) + x_3 = linear_3.view((1, 512, 12, -1)) + linear_3 = None + permute_3 = x_3.permute(0, 2, 1, 3) + x_3 = None + contiguous_3 = permute_3.contiguous() + permute_3 = None + view_7 = contiguous_3.view(-1, 512, 64) + contiguous_3 = None + pos_query_layer = view_7.repeat(1, 1, 1) + view_7 = None + linear_4 = torch._C._nn.linear( + rel_embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_, + ) + rel_embeddings_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_ = (None) + x_4 = linear_4.view((1, 512, 12, -1)) + linear_4 = None + permute_4 = x_4.permute(0, 2, 1, 3) + x_4 = None + contiguous_4 = permute_4.contiguous() + permute_4 = None + view_9 = contiguous_4.view(-1, 512, 64) + contiguous_4 = None + pos_key_layer = view_9.repeat(1, 1, 1) + view_9 = None + tensor_3 = torch.tensor(64, dtype=torch.float32) + mul_5 = tensor_3 * 3 + tensor_3 = None + scale_1 = torch.sqrt(mul_5) + mul_5 = None + transpose_1 = pos_key_layer.transpose(-1, -2) + pos_key_layer = None + c2p_att = torch.bmm(query_layer, transpose_1) + query_layer = transpose_1 = None + add_1 = relative_pos_1 + 256 + c2p_pos = torch.clamp(add_1, 0, 511) + add_1 = None + squeeze_1 = c2p_pos.squeeze(0) + c2p_pos = None + expand = squeeze_1.expand([12, 11, 11]) + squeeze_1 = None + c2p_att_1 = torch.gather(c2p_att, dim=-1, index=expand) + c2p_att = expand = None + to_4 = scale_1.to(dtype=torch.bfloat16) + scale_1 = None + truediv_3 = c2p_att_1 / to_4 + c2p_att_1 = to_4 = None + score = 0 + truediv_3 + truediv_3 = None + tensor_4 = torch.tensor(64, dtype=torch.float32) + mul_6 = tensor_4 * 3 + tensor_4 = None + scale_2 = torch.sqrt(mul_6) + mul_6 = None + neg = -relative_pos_1 + relative_pos_1 = None + add_3 = neg + 256 + neg = None + p2c_pos = torch.clamp(add_3, 0, 511) + add_3 = None + transpose_2 = pos_query_layer.transpose(-1, -2) + pos_query_layer = None + p2c_att = torch.bmm(key_layer, transpose_2) + key_layer = transpose_2 = None + squeeze_2 = p2c_pos.squeeze(0) + p2c_pos = None + expand_1 = squeeze_2.expand([12, 11, 11]) + squeeze_2 = None + gather_1 = torch.gather(p2c_att, dim=-1, index=expand_1) + p2c_att = expand_1 = None + p2c_att_1 = gather_1.transpose(-1, -2) + gather_1 = None + to_5 = scale_2.to(dtype=torch.bfloat16) + scale_2 = None + truediv_4 = p2c_att_1 / to_5 + p2c_att_1 = to_5 = None + score += truediv_4 + score_1 = score + score = truediv_4 = None + attention_scores_1 = attention_scores + score_1 + attention_scores = score_1 = None + attention_scores_2 = attention_scores_1.view(-1, 12, 11, 11) + attention_scores_1 = None + attention_mask_1 = attention_mask.bool() + invert = ~attention_mask_1 + attention_mask_1 = None + attention_scores_3 = attention_scores_2.masked_fill( + invert, -3.3895313892515355e38 + ) + attention_scores_2 = invert = None + attention_probs = torch.nn.functional.softmax(attention_scores_3, dim=-1) + attention_scores_3 = None + attention_probs_1 = torch.nn.functional.dropout( + attention_probs, 0.0, 
False, False + ) + attention_probs = None + view_11 = attention_probs_1.view(-1, 11, 11) + attention_probs_1 = None + context_layer = torch.bmm(view_11, value_layer) + view_11 = value_layer = None + view_12 = context_layer.view(-1, 12, 11, 64) + context_layer = None + permute_5 = view_12.permute(0, 2, 1, 3) + view_12 = None + context_layer_1 = permute_5.contiguous() + permute_5 = None + context_layer_2 = context_layer_1.view((1, 11, -1)) + context_layer_1 = None + hidden_states = torch._C._nn.linear( + context_layer_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.0, False, False) + hidden_states = None + add_5 = hidden_states_1 + embeddings_2 + hidden_states_1 = embeddings_2 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_5, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-07, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + hidden_states_4, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.0, False, False + ) + hidden_states_5 = None + add_6 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_6, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-07, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_8 = 
+    linear_8 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    x_5 = linear_8.view((1, 11, 12, -1))
+    linear_8 = None
+    permute_6 = x_5.permute(0, 2, 1, 3)
+    x_5 = None
+    contiguous_6 = permute_6.contiguous()
+    permute_6 = None
+    query_layer_1 = contiguous_6.view(-1, 11, 64)
+    contiguous_6 = None
+    linear_9 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    x_6 = linear_9.view((1, 11, 12, -1))
+    linear_9 = None
+    permute_7 = x_6.permute(0, 2, 1, 3)
+    x_6 = None
+    contiguous_7 = permute_7.contiguous()
+    permute_7 = None
+    key_layer_1 = contiguous_7.view(-1, 11, 64)
+    contiguous_7 = None
+    linear_10 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_ = None
+    x_7 = linear_10.view((1, 11, 12, -1))
+    linear_10 = None
+    permute_8 = x_7.permute(0, 2, 1, 3)
+    x_7 = None
+    contiguous_8 = permute_8.contiguous()
+    permute_8 = None
+    value_layer_1 = contiguous_8.view(-1, 11, 64)
+    contiguous_8 = None
+    tensor_5 = torch.tensor(64, dtype=torch.float32)
+    mul_7 = tensor_5 * 3
+    tensor_5 = None
+    scale_3 = torch.sqrt(mul_7)
+    mul_7 = None
+    transpose_4 = key_layer_1.transpose(-1, -2)
+    to_6 = scale_3.to(dtype=torch.bfloat16)
+    scale_3 = None
+    truediv_5 = transpose_4 / to_6
+    transpose_4 = to_6 = None
+    attention_scores_4 = torch.bmm(query_layer_1, truediv_5)
+    truediv_5 = None
+    rel_embeddings_3 = torch.nn.functional.dropout(rel_embeddings, 0.0, False, False)
+    relative_pos_2 = rel_pos_ids_3.unsqueeze(1)
+    relative_pos_3 = relative_pos_2.to(device=device(type="cuda", index=0), dtype=torch.int64)
+    relative_pos_2 = None
+    getitem_5 = rel_embeddings_3[(slice(0, 512, None), slice(None, None, None))]
+    rel_embeddings_3 = None
+    rel_embeddings_4 = getitem_5.unsqueeze(0)
+    getitem_5 = None
+    linear_11 = torch._C._nn.linear(rel_embeddings_4, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_ = None
+    x_8 = linear_11.view((1, 512, 12, -1))
+    linear_11 = None
+    permute_9 = x_8.permute(0, 2, 1, 3)
+    x_8 = None
+    contiguous_9 = permute_9.contiguous()
+    permute_9 = None
+    view_21 = contiguous_9.view(-1, 512, 64)
+    contiguous_9 = None
+    pos_query_layer_1 = view_21.repeat(1, 1, 1)
+    view_21 = None
+    linear_12 = torch._C._nn.linear(rel_embeddings_4, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    rel_embeddings_4 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_ = None
+    x_9 = linear_12.view((1, 512, 12, -1))
+    linear_12 = None
+    permute_10 = x_9.permute(0, 2, 1, 3)
+    x_9 = None
+    contiguous_10 = permute_10.contiguous()
+    permute_10 = None
+    view_23 = contiguous_10.view(-1, 512, 64)
+    contiguous_10 = None
+    pos_key_layer_1 = view_23.repeat(1, 1, 1)
+    view_23 = None
+    tensor_6 = torch.tensor(64, dtype=torch.float32)
+    mul_8 = tensor_6 * 3
+    tensor_6 = None
+    scale_4 = torch.sqrt(mul_8)
+    mul_8 = None
+    transpose_5 = pos_key_layer_1.transpose(-1, -2)
+    pos_key_layer_1 = None
+    c2p_att_2 = torch.bmm(query_layer_1, transpose_5)
+    query_layer_1 = transpose_5 = None
+    add_7 = relative_pos_3 + 256
+    c2p_pos_1 = torch.clamp(add_7, 0, 511)
+    add_7 = None
+    squeeze_3 = c2p_pos_1.squeeze(0)
+    c2p_pos_1 = None
+    expand_2 = squeeze_3.expand([12, 11, 11])
+    squeeze_3 = None
+    c2p_att_3 = torch.gather(c2p_att_2, dim=-1, index=expand_2)
+    c2p_att_2 = expand_2 = None
+    to_8 = scale_4.to(dtype=torch.bfloat16)
+    scale_4 = None
+    truediv_6 = c2p_att_3 / to_8
+    c2p_att_3 = to_8 = None
+    score_2 = 0 + truediv_6
+    truediv_6 = None
+    tensor_7 = torch.tensor(64, dtype=torch.float32)
+    mul_9 = tensor_7 * 3
+    tensor_7 = None
+    scale_5 = torch.sqrt(mul_9)
+    mul_9 = None
+    neg_1 = -relative_pos_3
+    relative_pos_3 = None
+    add_9 = neg_1 + 256
+    neg_1 = None
+    p2c_pos_1 = torch.clamp(add_9, 0, 511)
+    add_9 = None
+    transpose_6 = pos_query_layer_1.transpose(-1, -2)
+    pos_query_layer_1 = None
+    p2c_att_2 = torch.bmm(key_layer_1, transpose_6)
+    key_layer_1 = transpose_6 = None
+    squeeze_4 = p2c_pos_1.squeeze(0)
+    p2c_pos_1 = None
+    expand_3 = squeeze_4.expand([12, 11, 11])
+    squeeze_4 = None
+    gather_3 = torch.gather(p2c_att_2, dim=-1, index=expand_3)
+    p2c_att_2 = expand_3 = None
+    p2c_att_3 = gather_3.transpose(-1, -2)
+    gather_3 = None
+    to_9 = scale_5.to(dtype=torch.bfloat16)
+    scale_5 = None
+    truediv_7 = p2c_att_3 / to_9
+    p2c_att_3 = to_9 = None
+    score_2 += truediv_7
+    score_3 = score_2
+    score_2 = truediv_7 = None
+    attention_scores_5 = attention_scores_4 + score_3
+    attention_scores_4 = score_3 = None
+    attention_scores_6 = attention_scores_5.view(-1, 12, 11, 11)
+    attention_scores_5 = None
+    attention_mask_2 = attention_mask.bool()
+    invert_1 = ~attention_mask_2
+    attention_mask_2 = None
+    attention_scores_7 = attention_scores_6.masked_fill(invert_1, -3.3895313892515355e38)
+    attention_scores_6 = invert_1 = None
+    attention_probs_2 = torch.nn.functional.softmax(attention_scores_7, dim=-1)
+    attention_scores_7 = None
+    attention_probs_3 = torch.nn.functional.dropout(attention_probs_2, 0.0, False, False)
+    attention_probs_2 = None
+    view_25 = attention_probs_3.view(-1, 11, 11)
+    attention_probs_3 = None
+    context_layer_3 = torch.bmm(view_25, value_layer_1)
+    view_25 = value_layer_1 = None
+    view_26 = context_layer_3.view(-1, 12, 11, 64)
+    context_layer_3 = None
+    permute_11 = view_26.permute(0, 2, 1, 3)
+    view_26 = None
+    context_layer_4 = permute_11.contiguous()
+    permute_11 = None
+    context_layer_5 = context_layer_4.view((1, 11, -1))
+    context_layer_4 = None
+    hidden_states_8 = torch._C._nn.linear(context_layer_5, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_9 = torch.nn.functional.dropout(hidden_states_8, 0.0, False, False)
+    hidden_states_8 = None
+    add_11 = hidden_states_9 + hidden_states_7
+    hidden_states_9 = hidden_states_7 = None
+    hidden_states_10 = torch.nn.functional.layer_norm(add_11, (768,), l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_11 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_11 = torch._C._nn.linear(hidden_states_10, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+    hidden_states_11 = None
+    hidden_states_13 = torch._C._nn.linear(hidden_states_12, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_)
+    hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_14 = torch.nn.functional.dropout(hidden_states_13, 0.0, False, False)
+    hidden_states_13 = None
+    add_12 = hidden_states_14 + hidden_states_10
+    hidden_states_14 = hidden_states_10 = None
+    hidden_states_15 = torch.nn.functional.layer_norm(add_12, (768,), l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_16 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    x_10 = linear_16.view((1, 11, 12, -1))
+    linear_16 = None
+    permute_12 = x_10.permute(0, 2, 1, 3)
+    x_10 = None
+    contiguous_12 = permute_12.contiguous()
+    permute_12 = None
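+    # Head split: each (1, 11, 768) projection is reshaped to (1, 11, 12, 64),
+    # permuted to (1, 12, 11, 64), and flattened to (12, 11, 64) so that batch
+    # and heads share the leading dimension expected by torch.bmm.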
+    query_layer_2 = contiguous_12.view(-1, 11, 64)
+    contiguous_12 = None
+    linear_17 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    x_11 = linear_17.view((1, 11, 12, -1))
+    linear_17 = None
+    permute_13 = x_11.permute(0, 2, 1, 3)
+    x_11 = None
+    contiguous_13 = permute_13.contiguous()
+    permute_13 = None
+    key_layer_2 = contiguous_13.view(-1, 11, 64)
+    contiguous_13 = None
+    linear_18 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_ = None
+    x_12 = linear_18.view((1, 11, 12, -1))
+    linear_18 = None
+    permute_14 = x_12.permute(0, 2, 1, 3)
+    x_12 = None
+    contiguous_14 = permute_14.contiguous()
+    permute_14 = None
+    value_layer_2 = contiguous_14.view(-1, 11, 64)
+    contiguous_14 = None
+    tensor_8 = torch.tensor(64, dtype=torch.float32)
+    mul_10 = tensor_8 * 3
+    tensor_8 = None
+    scale_6 = torch.sqrt(mul_10)
+    mul_10 = None
+    transpose_8 = key_layer_2.transpose(-1, -2)
+    to_10 = scale_6.to(dtype=torch.bfloat16)
+    scale_6 = None
+    truediv_8 = transpose_8 / to_10
+    transpose_8 = to_10 = None
+    attention_scores_8 = torch.bmm(query_layer_2, truediv_8)
+    truediv_8 = None
+    rel_embeddings_5 = torch.nn.functional.dropout(rel_embeddings, 0.0, False, False)
+    relative_pos_4 = rel_pos_ids_3.unsqueeze(1)
+    relative_pos_5 = relative_pos_4.to(device=device(type="cuda", index=0), dtype=torch.int64)
+    relative_pos_4 = None
+    getitem_6 = rel_embeddings_5[(slice(0, 512, None), slice(None, None, None))]
+    rel_embeddings_5 = None
+    rel_embeddings_6 = getitem_6.unsqueeze(0)
+    getitem_6 = None
+    linear_19 = torch._C._nn.linear(rel_embeddings_6, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_ = None
+    x_13 = linear_19.view((1, 512, 12, -1))
+    linear_19 = None
+    permute_15 = x_13.permute(0, 2, 1, 3)
+    x_13 = None
+    contiguous_15 = permute_15.contiguous()
+    permute_15 = None
+    view_35 = contiguous_15.view(-1, 512, 64)
+    contiguous_15 = None
+    pos_query_layer_2 = view_35.repeat(1, 1, 1)
+    view_35 = None
+    linear_20 = torch._C._nn.linear(rel_embeddings_6, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    rel_embeddings_6 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_ = None
+    x_14 = linear_20.view((1, 512, 12, -1))
+    linear_20 = None
+    permute_16 = x_14.permute(0, 2, 1, 3)
+    x_14 = None
+    contiguous_16 = permute_16.contiguous()
+    permute_16 = None
+    view_37 = contiguous_16.view(-1, 512, 64)
+    contiguous_16 = None
+    pos_key_layer_2 = view_37.repeat(1, 1, 1)
+    view_37 = None
+    tensor_9 = torch.tensor(64, dtype=torch.float32)
+    mul_11 = tensor_9 * 3
+    tensor_9 = None
+    scale_7 = torch.sqrt(mul_11)
+    mul_11 = None
+    transpose_9 = pos_key_layer_2.transpose(-1, -2)
+    pos_key_layer_2 = None
+    c2p_att_4 = torch.bmm(query_layer_2, transpose_9)
+    query_layer_2 = transpose_9 = None
+    add_13 = relative_pos_5 + 256
+    c2p_pos_2 = torch.clamp(add_13, 0, 511)
+    add_13 = None
+    squeeze_5 = c2p_pos_2.squeeze(0)
+    c2p_pos_2 = None
+    expand_4 = squeeze_5.expand([12, 11, 11])
+    squeeze_5 = None
+    c2p_att_5 = torch.gather(c2p_att_4, dim=-1, index=expand_4)
+    c2p_att_4 = expand_4 = None
+    to_12 = scale_7.to(dtype=torch.bfloat16)
+    scale_7 = None
+    truediv_9 = c2p_att_5 / to_12
+    c2p_att_5 = to_12 = None
+    score_4 = 0 + truediv_9
+    truediv_9 = None
+    tensor_10 = torch.tensor(64, dtype=torch.float32)
+    mul_12 = tensor_10 * 3
+    tensor_10 = None
+    scale_8 = torch.sqrt(mul_12)
+    mul_12 = None
+    neg_2 = -relative_pos_5
+    relative_pos_5 = None
+    add_15 = neg_2 + 256
+    neg_2 = None
+    p2c_pos_2 = torch.clamp(add_15, 0, 511)
+    add_15 = None
+    transpose_10 = pos_query_layer_2.transpose(-1, -2)
+    pos_query_layer_2 = None
+    p2c_att_4 = torch.bmm(key_layer_2, transpose_10)
+    key_layer_2 = transpose_10 = None
+    squeeze_6 = p2c_pos_2.squeeze(0)
+    p2c_pos_2 = None
+    expand_5 = squeeze_6.expand([12, 11, 11])
+    squeeze_6 = None
+    gather_5 = torch.gather(p2c_att_4, dim=-1, index=expand_5)
+    p2c_att_4 = expand_5 = None
+    p2c_att_5 = gather_5.transpose(-1, -2)
+    gather_5 = None
+    to_13 = scale_8.to(dtype=torch.bfloat16)
+    scale_8 = None
+    truediv_10 = p2c_att_5 / to_13
+    p2c_att_5 = to_13 = None
+    score_4 += truediv_10
+    score_5 = score_4
+    score_4 = truediv_10 = None
+    attention_scores_9 = attention_scores_8 + score_5
+    attention_scores_8 = score_5 = None
+    attention_scores_10 = attention_scores_9.view(-1, 12, 11, 11)
+    attention_scores_9 = None
+    attention_mask_3 = attention_mask.bool()
+    invert_2 = ~attention_mask_3
+    attention_mask_3 = None
+    attention_scores_11 = attention_scores_10.masked_fill(invert_2, -3.3895313892515355e38)
+    attention_scores_10 = invert_2 = None
+    attention_probs_4 = torch.nn.functional.softmax(attention_scores_11, dim=-1)
+    attention_scores_11 = None
+    attention_probs_5 = torch.nn.functional.dropout(attention_probs_4, 0.0, False, False)
+    attention_probs_4 = None
+    view_39 = attention_probs_5.view(-1, 11, 11)
+    attention_probs_5 = None
+    context_layer_6 = torch.bmm(view_39, value_layer_2)
+    view_39 = value_layer_2 = None
+    view_40 = context_layer_6.view(-1, 12, 11, 64)
+    context_layer_6 = None
+    permute_17 = view_40.permute(0, 2, 1, 3)
+    view_40 = None
+    context_layer_7 = permute_17.contiguous()
+    permute_17 = None
+    context_layer_8 = context_layer_7.view((1, 11, -1))
+    context_layer_7 = None
+    hidden_states_16 = torch._C._nn.linear(context_layer_8, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_)
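+    # In the attention block above, relative positions are shifted by 256 and
+    # clamped to [0, 511] so they index the 512-row rel_embeddings table; the
+    # p2c branch reuses the same table with the sign of the relative distance
+    # flipped before bucketing.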
+    context_layer_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_17 = torch.nn.functional.dropout(hidden_states_16, 0.0, False, False)
+    hidden_states_16 = None
+    add_17 = hidden_states_17 + hidden_states_15
+    hidden_states_17 = hidden_states_15 = None
+    hidden_states_18 = torch.nn.functional.layer_norm(add_17, (768,), l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_17 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_19 = torch._C._nn.linear(hidden_states_18, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+    hidden_states_19 = None
+    hidden_states_21 = torch._C._nn.linear(hidden_states_20, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_)
+    hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_22 = torch.nn.functional.dropout(hidden_states_21, 0.0, False, False)
+    hidden_states_21 = None
+    add_18 = hidden_states_22 + hidden_states_18
+    hidden_states_22 = hidden_states_18 = None
+    hidden_states_23 = torch.nn.functional.layer_norm(add_18, (768,), l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_18 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_24 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    x_15 = linear_24.view((1, 11, 12, -1))
+    linear_24 = None
+    permute_18 = x_15.permute(0, 2, 1, 3)
+    x_15 = None
+    contiguous_18 = permute_18.contiguous()
+    permute_18 = None
+    query_layer_3 = contiguous_18.view(-1, 11, 64)
+    contiguous_18 = None
+    linear_25 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    x_16 = linear_25.view((1, 11, 12, -1))
+    linear_25 = None
+    permute_19 = x_16.permute(0, 2, 1, 3)
+    x_16 = None
+    contiguous_19 = permute_19.contiguous()
+    permute_19 = None
+    key_layer_3 = contiguous_19.view(-1, 11, 64)
+    contiguous_19 = None
+    linear_26 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_ = None
+    x_17 = linear_26.view((1, 11, 12, -1))
+    linear_26 = None
+    permute_20 = x_17.permute(0, 2, 1, 3)
+    x_17 = None
+    contiguous_20 = permute_20.contiguous()
+    permute_20 = None
+    value_layer_3 = contiguous_20.view(-1, 11, 64)
+    contiguous_20 = None
+    tensor_11 = torch.tensor(64, dtype=torch.float32)
+    mul_13 = tensor_11 * 3
+    tensor_11 = None
+    scale_9 = torch.sqrt(mul_13)
+    mul_13 = None
+    transpose_12 = key_layer_3.transpose(-1, -2)
+    to_14 = scale_9.to(dtype=torch.bfloat16)
+    scale_9 = None
+    truediv_11 = transpose_12 / to_14
+    transpose_12 = to_14 = None
+    attention_scores_12 = torch.bmm(query_layer_3, truediv_11)
+    truediv_11 = None
+    rel_embeddings_7 = torch.nn.functional.dropout(rel_embeddings, 0.0, False, False)
+    relative_pos_6 = rel_pos_ids_3.unsqueeze(1)
+    relative_pos_7 = relative_pos_6.to(device=device(type="cuda", index=0), dtype=torch.int64)
+    relative_pos_6 = None
+    getitem_7 = rel_embeddings_7[(slice(0, 512, None), slice(None, None, None))]
+    rel_embeddings_7 = None
+    rel_embeddings_8 = getitem_7.unsqueeze(0)
+    getitem_7 = None
+    linear_27 = torch._C._nn.linear(rel_embeddings_8, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_ = None
+    x_18 = linear_27.view((1, 512, 12, -1))
+    linear_27 = None
+    permute_21 = x_18.permute(0, 2, 1, 3)
+    x_18 = None
+    contiguous_21 = permute_21.contiguous()
+    permute_21 = None
+    view_49 = contiguous_21.view(-1, 512, 64)
+    contiguous_21 = None
+    pos_query_layer_3 = view_49.repeat(1, 1, 1)
+    view_49 = None
+    linear_28 = torch._C._nn.linear(rel_embeddings_8, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    rel_embeddings_8 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_ = None
+    x_19 = linear_28.view((1, 512, 12, -1))
+    linear_28 = None
+    permute_22 = x_19.permute(0, 2, 1, 3)
+    x_19 = None
+    contiguous_22 = permute_22.contiguous()
+    permute_22 = None
+    view_51 = contiguous_22.view(-1, 512, 64)
+    contiguous_22 = None
+    pos_key_layer_3 = view_51.repeat(1, 1, 1)
+    view_51 = None
+    tensor_12 = torch.tensor(64, dtype=torch.float32)
+    mul_14 = tensor_12 * 3
+    tensor_12 = None
+    scale_10 = torch.sqrt(mul_14)
+    mul_14 = None
+    transpose_13 = pos_key_layer_3.transpose(-1, -2)
+    pos_key_layer_3 = None
+    c2p_att_6 = torch.bmm(query_layer_3, transpose_13)
+    query_layer_3 = transpose_13 = None
+    add_19 = relative_pos_7 + 256
+    c2p_pos_3 = torch.clamp(add_19, 0, 511)
+    add_19 = None
+    squeeze_7 = c2p_pos_3.squeeze(0)
+    c2p_pos_3 = None
+    expand_6 = squeeze_7.expand([12, 11, 11])
+    squeeze_7 = None
+    c2p_att_7 = torch.gather(c2p_att_6, dim=-1, index=expand_6)
+    c2p_att_6 = expand_6 = None
+    to_16 = scale_10.to(dtype=torch.bfloat16)
+    scale_10 = None
+    truediv_12 = c2p_att_7 / to_16
+    c2p_att_7 = to_16 = None
+    score_6 = 0 + truediv_12
+    truediv_12 = None
+    tensor_13 = torch.tensor(64, dtype=torch.float32)
+    mul_15 = tensor_13 * 3
+    tensor_13 = None
+    scale_11 = torch.sqrt(mul_15)
+    mul_15 = None
+    neg_3 = -relative_pos_7
+    relative_pos_7 = None
+    add_21 = neg_3 + 256
+    neg_3 = None
+    p2c_pos_3 = torch.clamp(add_21, 0, 511)
+    add_21 = None
+    transpose_14 = pos_query_layer_3.transpose(-1, -2)
+    pos_query_layer_3 = None
+    p2c_att_6 = torch.bmm(key_layer_3, transpose_14)
+    key_layer_3 = transpose_14 = None
+    squeeze_8 = p2c_pos_3.squeeze(0)
+    p2c_pos_3 = None
+    expand_7 = squeeze_8.expand([12, 11, 11])
+    squeeze_8 = None
+    gather_7 = torch.gather(p2c_att_6, dim=-1, index=expand_7)
+    p2c_att_6 = expand_7 = None
+    p2c_att_7 = gather_7.transpose(-1, -2)
+    gather_7 = None
+    to_17 = scale_11.to(dtype=torch.bfloat16)
+    scale_11 = None
+    truediv_13 = p2c_att_7 / to_17
+    p2c_att_7 = to_17 = None
+    score_6 += truediv_13
+    score_7 = score_6
+    score_6 = truediv_13 = None
+    attention_scores_13 = attention_scores_12 + score_7
+    attention_scores_12 = score_7 = None
+    attention_scores_14 = attention_scores_13.view(-1, 12, 11, 11)
+    attention_scores_13 = None
+    attention_mask_4 = attention_mask.bool()
+    invert_3 = ~attention_mask_4
+    attention_mask_4 = None
+    attention_scores_15 = attention_scores_14.masked_fill(invert_3, -3.3895313892515355e38)
+    attention_scores_14 = invert_3 = None
+    attention_probs_6 = torch.nn.functional.softmax(attention_scores_15, dim=-1)
+    attention_scores_15 = None
+    attention_probs_7 = torch.nn.functional.dropout(attention_probs_6, 0.0, False, False)
+    attention_probs_6 = None
+    view_53 = attention_probs_7.view(-1, 11, 11)
+    attention_probs_7 = None
+    context_layer_9 = torch.bmm(view_53, value_layer_3)
+    view_53 = value_layer_3 = None
+    view_54 = context_layer_9.view(-1, 12, 11, 64)
+    context_layer_9 = None
+    permute_23 = view_54.permute(0, 2, 1, 3)
+    view_54 = None
+    context_layer_10 = permute_23.contiguous()
+    permute_23 = None
+    context_layer_11 = context_layer_10.view((1, 11, -1))
+    context_layer_10 = None
+    hidden_states_24 = torch._C._nn.linear(context_layer_11, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_25 = torch.nn.functional.dropout(hidden_states_24, 0.0, False, False)
+    hidden_states_24 = None
+    add_23 = hidden_states_25 + hidden_states_23
+    hidden_states_25 = hidden_states_23 = None
+    hidden_states_26 = torch.nn.functional.layer_norm(add_23, (768,), l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_23 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_27 = torch._C._nn.linear(hidden_states_26, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+    hidden_states_27 = None
+    hidden_states_29 = torch._C._nn.linear(hidden_states_28, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_)
+    hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_30 = torch.nn.functional.dropout(hidden_states_29, 0.0, False, False)
+    hidden_states_29 = None
+    add_24 = hidden_states_30 + hidden_states_26
+    hidden_states_30 = hidden_states_26 = None
+    hidden_states_31 = torch.nn.functional.layer_norm(add_24, (768,), l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_24 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_32 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    x_20 = linear_32.view((1, 11, 12, -1))
+    linear_32 = None
+    permute_24 = x_20.permute(0, 2, 1, 3)
+    x_20 = None
+    contiguous_24 = permute_24.contiguous()
+    permute_24 = None
+    query_layer_4 = contiguous_24.view(-1, 11, 64)
+    contiguous_24 = None
+    linear_33 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    x_21 = linear_33.view((1, 11, 12, -1))
+    linear_33 = None
+    permute_25 = x_21.permute(0, 2, 1, 3)
+    x_21 = None
+    contiguous_25 = permute_25.contiguous()
+    permute_25 = None
+    key_layer_4 = contiguous_25.view(-1, 11, 64)
+    contiguous_25 = None
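+    # Note: rel_embeddings is re-sliced and re-projected inside every layer, and
+    # the .repeat(1, 1, 1) on the position projections is a no-op kept verbatim
+    # from the captured graph rather than an actual broadcast.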
+    linear_34 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_ = None
+    x_22 = linear_34.view((1, 11, 12, -1))
+    linear_34 = None
+    permute_26 = x_22.permute(0, 2, 1, 3)
+    x_22 = None
+    contiguous_26 = permute_26.contiguous()
+    permute_26 = None
+    value_layer_4 = contiguous_26.view(-1, 11, 64)
+    contiguous_26 = None
+    tensor_14 = torch.tensor(64, dtype=torch.float32)
+    mul_16 = tensor_14 * 3
+    tensor_14 = None
+    scale_12 = torch.sqrt(mul_16)
+    mul_16 = None
+    transpose_16 = key_layer_4.transpose(-1, -2)
+    to_18 = scale_12.to(dtype=torch.bfloat16)
+    scale_12 = None
+    truediv_14 = transpose_16 / to_18
+    transpose_16 = to_18 = None
+    attention_scores_16 = torch.bmm(query_layer_4, truediv_14)
+    truediv_14 = None
+    rel_embeddings_9 = torch.nn.functional.dropout(rel_embeddings, 0.0, False, False)
+    relative_pos_8 = rel_pos_ids_3.unsqueeze(1)
+    relative_pos_9 = relative_pos_8.to(device=device(type="cuda", index=0), dtype=torch.int64)
+    relative_pos_8 = None
+    getitem_8 = rel_embeddings_9[(slice(0, 512, None), slice(None, None, None))]
+    rel_embeddings_9 = None
+    rel_embeddings_10 = getitem_8.unsqueeze(0)
+    getitem_8 = None
+    linear_35 = torch._C._nn.linear(rel_embeddings_10, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_ = None
+    x_23 = linear_35.view((1, 512, 12, -1))
+    linear_35 = None
+    permute_27 = x_23.permute(0, 2, 1, 3)
+    x_23 = None
+    contiguous_27 = permute_27.contiguous()
+    permute_27 = None
+    view_63 = contiguous_27.view(-1, 512, 64)
+    contiguous_27 = None
+    pos_query_layer_4 = view_63.repeat(1, 1, 1)
+    view_63 = None
+    linear_36 = torch._C._nn.linear(rel_embeddings_10, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    rel_embeddings_10 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_ = None
+    x_24 = linear_36.view((1, 512, 12, -1))
+    linear_36 = None
+    permute_28 = x_24.permute(0, 2, 1, 3)
+    x_24 = None
+    contiguous_28 = permute_28.contiguous()
+    permute_28 = None
+    view_65 = contiguous_28.view(-1, 512, 64)
+    contiguous_28 = None
+    pos_key_layer_4 = view_65.repeat(1, 1, 1)
+    view_65 = None
+    tensor_15 = torch.tensor(64, dtype=torch.float32)
+    mul_17 = tensor_15 * 3
+    tensor_15 = None
+    scale_13 = torch.sqrt(mul_17)
+    mul_17 = None
+    transpose_17 = pos_key_layer_4.transpose(-1, -2)
+    pos_key_layer_4 = None
+    c2p_att_8 = torch.bmm(query_layer_4, transpose_17)
+    query_layer_4 = transpose_17 = None
+    add_25 = relative_pos_9 + 256
+    c2p_pos_4 = torch.clamp(add_25, 0, 511)
+    add_25 = None
+    squeeze_9 = c2p_pos_4.squeeze(0)
+    c2p_pos_4 = None
+    expand_8 = squeeze_9.expand([12, 11, 11])
+    squeeze_9 = None
+    c2p_att_9 = torch.gather(c2p_att_8, dim=-1, index=expand_8)
+    c2p_att_8 = expand_8 = None
+    to_20 = scale_13.to(dtype=torch.bfloat16)
+    scale_13 = None
+    truediv_15 = c2p_att_9 / to_20
+    c2p_att_9 = to_20 = None
+    score_8 = 0 + truediv_15
+    truediv_15 = None
+    tensor_16 = torch.tensor(64, dtype=torch.float32)
+    mul_18 = tensor_16 * 3
+    tensor_16 = None
+    scale_14 = torch.sqrt(mul_18)
+    mul_18 = None
+    neg_4 = -relative_pos_9
+    relative_pos_9 = None
+    add_27 = neg_4 + 256
+    neg_4 = None
+    p2c_pos_4 = torch.clamp(add_27, 0, 511)
+    add_27 = None
+    transpose_18 = pos_query_layer_4.transpose(-1, -2)
+    pos_query_layer_4 = None
+    p2c_att_8 = torch.bmm(key_layer_4, transpose_18)
+    key_layer_4 = transpose_18 = None
+    squeeze_10 = p2c_pos_4.squeeze(0)
+    p2c_pos_4 = None
+    expand_9 = squeeze_10.expand([12, 11, 11])
+    squeeze_10 = None
+    gather_9 = torch.gather(p2c_att_8, dim=-1, index=expand_9)
+    p2c_att_8 = expand_9 = None
+    p2c_att_9 = gather_9.transpose(-1, -2)
+    gather_9 = None
+    to_21 = scale_14.to(dtype=torch.bfloat16)
+    scale_14 = None
+    truediv_16 = p2c_att_9 / to_21
+    p2c_att_9 = to_21 = None
+    score_8 += truediv_16
+    score_9 = score_8
+    score_8 = truediv_16 = None
+    attention_scores_17 = attention_scores_16 + score_9
+    attention_scores_16 = score_9 = None
+    attention_scores_18 = attention_scores_17.view(-1, 12, 11, 11)
+    attention_scores_17 = None
+    attention_mask_5 = attention_mask.bool()
+    invert_4 = ~attention_mask_5
+    attention_mask_5 = None
+    attention_scores_19 = attention_scores_18.masked_fill(invert_4, -3.3895313892515355e38)
+    attention_scores_18 = invert_4 = None
+    attention_probs_8 = torch.nn.functional.softmax(attention_scores_19, dim=-1)
+    attention_scores_19 = None
+    attention_probs_9 = torch.nn.functional.dropout(attention_probs_8, 0.0, False, False)
+    attention_probs_8 = None
+    view_67 = attention_probs_9.view(-1, 11, 11)
+    attention_probs_9 = None
+    context_layer_12 = torch.bmm(view_67, value_layer_4)
+    view_67 = value_layer_4 = None
+    view_68 = context_layer_12.view(-1, 12, 11, 64)
+    context_layer_12 = None
+    permute_29 = view_68.permute(0, 2, 1, 3)
+    view_68 = None
+    context_layer_13 = permute_29.contiguous()
+    permute_29 = None
+    context_layer_14 = context_layer_13.view((1, 11, -1))
+    context_layer_13 = None
+    hidden_states_32 = torch._C._nn.linear(context_layer_14, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_33 = torch.nn.functional.dropout(hidden_states_32, 0.0, False, False)
+    hidden_states_32 = None
+    add_29 = hidden_states_33 + hidden_states_31
+    hidden_states_33 = hidden_states_31 = None
+    hidden_states_34 = torch.nn.functional.layer_norm(add_29, (768,), l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_29 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_35 = torch._C._nn.linear(hidden_states_34, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+    hidden_states_35 = None
+    hidden_states_37 = torch._C._nn.linear(hidden_states_36, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_)
+    hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_38 = torch.nn.functional.dropout(hidden_states_37, 0.0, False, False)
+    hidden_states_37 = None
+    add_30 = hidden_states_38 + hidden_states_34
+    hidden_states_38 = hidden_states_34 = None
+    hidden_states_39 = torch.nn.functional.layer_norm(add_30, (768,), l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_30 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_40 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    x_25 = linear_40.view((1, 11, 12, -1))
+    linear_40 = None
+    permute_30 = x_25.permute(0, 2, 1, 3)
+    x_25 = None
+    contiguous_30 = permute_30.contiguous()
+    permute_30 = None
+    query_layer_5 = contiguous_30.view(-1, 11, 64)
+    contiguous_30 = None
+    linear_41 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    x_26 = linear_41.view((1, 11, 12, -1))
+    linear_41 = None
+    permute_31 = x_26.permute(0, 2, 1, 3)
+    x_26 = None
+    contiguous_31 = permute_31.contiguous()
+    permute_31 = None
+    key_layer_5 = contiguous_31.view(-1, 11, 64)
+    contiguous_31 = None
+    linear_42 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_)
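+    # Layer 5 is the last encoder block. The sample input has sequence length 11
+    # (see weight_meta.py: L_input_ids_ has shape [1, 11]), which is why every
+    # attention matrix in this graph is hard-coded to (12, 11, 11): 12 heads over
+    # an 11-token sequence.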
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_ = None
+    x_27 = linear_42.view((1, 11, 12, -1))
+    linear_42 = None
+    permute_32 = x_27.permute(0, 2, 1, 3)
+    x_27 = None
+    contiguous_32 = permute_32.contiguous()
+    permute_32 = None
+    value_layer_5 = contiguous_32.view(-1, 11, 64)
+    contiguous_32 = None
+    tensor_17 = torch.tensor(64, dtype=torch.float32)
+    mul_19 = tensor_17 * 3
+    tensor_17 = None
+    scale_15 = torch.sqrt(mul_19)
+    mul_19 = None
+    transpose_20 = key_layer_5.transpose(-1, -2)
+    to_22 = scale_15.to(dtype=torch.bfloat16)
+    scale_15 = None
+    truediv_17 = transpose_20 / to_22
+    transpose_20 = to_22 = None
+    attention_scores_20 = torch.bmm(query_layer_5, truediv_17)
+    truediv_17 = None
+    rel_embeddings_11 = torch.nn.functional.dropout(rel_embeddings, 0.0, False, False)
+    rel_embeddings = None
+    relative_pos_10 = rel_pos_ids_3.unsqueeze(1)
+    rel_pos_ids_3 = None
+    relative_pos_11 = relative_pos_10.to(device=device(type="cuda", index=0), dtype=torch.int64)
+    relative_pos_10 = None
+    getitem_9 = rel_embeddings_11[(slice(0, 512, None), slice(None, None, None))]
+    rel_embeddings_11 = None
+    rel_embeddings_12 = getitem_9.unsqueeze(0)
+    getitem_9 = None
+    linear_43 = torch._C._nn.linear(rel_embeddings_12, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_ = None
+    x_28 = linear_43.view((1, 512, 12, -1))
+    linear_43 = None
+    permute_33 = x_28.permute(0, 2, 1, 3)
+    x_28 = None
+    contiguous_33 = permute_33.contiguous()
+    permute_33 = None
+    view_77 = contiguous_33.view(-1, 512, 64)
+    contiguous_33 = None
+    pos_query_layer_5 = view_77.repeat(1, 1, 1)
+    view_77 = None
+    linear_44 = torch._C._nn.linear(rel_embeddings_12, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_)
+    rel_embeddings_12 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_ = None
+    x_29 = linear_44.view((1, 512, 12, -1))
+    linear_44 = None
+    permute_34 = x_29.permute(0, 2, 1, 3)
+    x_29 = None
+    contiguous_34 = permute_34.contiguous()
+    permute_34 = None
+    view_79 = contiguous_34.view(-1, 512, 64)
+    contiguous_34 = None
+    pos_key_layer_5 = view_79.repeat(1, 1, 1)
+    view_79 = None
+    tensor_18 = torch.tensor(64, dtype=torch.float32)
+    mul_20 = tensor_18 * 3
+    tensor_18 = None
+    scale_16 = torch.sqrt(mul_20)
+    mul_20 = None
+    transpose_21 = pos_key_layer_5.transpose(-1, -2)
+    pos_key_layer_5 = None
+    c2p_att_10 = torch.bmm(query_layer_5, transpose_21)
+    query_layer_5 = transpose_21 = None
+    add_31 = relative_pos_11 + 256
+    c2p_pos_5 = torch.clamp(add_31, 0, 511)
+    add_31 = None
+    squeeze_11 = c2p_pos_5.squeeze(0)
+    c2p_pos_5 = None
+    expand_10 = squeeze_11.expand([12, 11, 11])
+    squeeze_11 = None
+    c2p_att_11 = torch.gather(c2p_att_10, dim=-1, index=expand_10)
+    c2p_att_10 = expand_10 = None
+    to_24 = scale_16.to(dtype=torch.bfloat16)
+    scale_16 = None
+    truediv_18 = c2p_att_11 / to_24
+    c2p_att_11 = to_24 = None
+    score_10 = 0 + truediv_18
+    truediv_18 = None
+    tensor_19 = torch.tensor(64, dtype=torch.float32)
+    mul_21 = tensor_19 * 3
+    tensor_19 = None
+    scale_17 = torch.sqrt(mul_21)
+    mul_21 = None
+    neg_5 = -relative_pos_11
+    relative_pos_11 = None
+    add_33 = neg_5 + 256
+    neg_5 = None
+    p2c_pos_5 = torch.clamp(add_33, 0, 511)
+    add_33 = None
+    transpose_22 = pos_query_layer_5.transpose(-1, -2)
+    pos_query_layer_5 = None
+    p2c_att_10 = torch.bmm(key_layer_5, transpose_22)
+    key_layer_5 = transpose_22 = None
+    squeeze_12 = p2c_pos_5.squeeze(0)
+    p2c_pos_5 = None
+    expand_11 = squeeze_12.expand([12, 11, 11])
+    squeeze_12 = None
+    gather_11 = torch.gather(p2c_att_10, dim=-1, index=expand_11)
+    p2c_att_10 = expand_11 = None
+    p2c_att_11 = gather_11.transpose(-1, -2)
+    gather_11 = None
+    to_25 = scale_17.to(dtype=torch.bfloat16)
+    scale_17 = None
+    truediv_19 = p2c_att_11 / to_25
+    p2c_att_11 = to_25 = None
+    score_10 += truediv_19
+    score_11 = score_10
+    score_10 = truediv_19 = None
+    attention_scores_21 = attention_scores_20 + score_11
+    attention_scores_20 = score_11 = None
+    attention_scores_22 = attention_scores_21.view(-1, 12, 11, 11)
+    attention_scores_21 = None
+    attention_mask_6 = attention_mask.bool()
+    attention_mask = None
+    invert_5 = ~attention_mask_6
+    attention_mask_6 = None
+    attention_scores_23 = attention_scores_22.masked_fill(invert_5, -3.3895313892515355e38)
+    attention_scores_22 = invert_5 = None
+    attention_probs_10 = torch.nn.functional.softmax(attention_scores_23, dim=-1)
+    attention_scores_23 = None
+    attention_probs_11 = torch.nn.functional.dropout(attention_probs_10, 0.0, False, False)
+    attention_probs_10 = None
+    view_81 = attention_probs_11.view(-1, 11, 11)
+    attention_probs_11 = None
+    context_layer_15 = torch.bmm(view_81, value_layer_5)
+    view_81 = value_layer_5 = None
+    view_82 = context_layer_15.view(-1, 12, 11, 64)
+    context_layer_15 = None
+    permute_35 = view_82.permute(0, 2, 1, 3)
+    view_82 = None
+    context_layer_16 = permute_35.contiguous()
+    permute_35 = None
+    context_layer_17 = context_layer_16.view((1, 11, -1))
+    context_layer_16 = None
+    hidden_states_40 = torch._C._nn.linear(context_layer_17, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_)
+    context_layer_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_41 = torch.nn.functional.dropout(hidden_states_40, 0.0, False, False)
+    hidden_states_40 = None
+    add_35 = hidden_states_41 + hidden_states_39
+    hidden_states_41 = hidden_states_39 = None
+    hidden_states_42 = torch.nn.functional.layer_norm(add_35, (768,), l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_35 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_43 = torch._C._nn.linear(hidden_states_42, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+    hidden_states_43 = None
+    hidden_states_45 = torch._C._nn.linear(hidden_states_44, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_)
+    hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+    hidden_states_46 = torch.nn.functional.dropout(hidden_states_45, 0.0, False, False)
+    hidden_states_45 = None
+    add_36 = hidden_states_46 + hidden_states_42
+    hidden_states_46 = hidden_states_42 = None
+    hidden_states_47 = torch.nn.functional.layer_norm(add_36, (768,), l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, 1e-07)
+    add_36 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
+    return (hidden_states_47,)
diff --git a/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/weight_meta.py b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/weight_meta.py
new file mode 100644
index 000000000..a96db880b
--- /dev/null
+++ b/samples/transformers-auto-model/andersonbcdefg_seo-spam-classifier/weight_meta.py
@@ -0,0 +1,1038 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 26119, 9923, 269, 427, 264, 6317, 15803, 16981, 260, 2]
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [128100, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
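+
+
+# Each Program_weight_tensor_meta_* class in this file records one input or
+# parameter tensor of the captured graph: its dynamo-mangled name, shape,
+# dtype, and device, plus summary statistics (mean/std) for large tensors, or
+# the literal values (data) for small integer tensors such as input_ids and
+# attention_mask.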
"L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_rel_embeddings_parameters_weight_: + name = "L_self_modules_encoder_modules_rel_embeddings_parameters_weight_" + shape = [512, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_weight_" + shape = [768, 768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_proj_parameters_bias_" + shape = [768] + dtype = "torch.bfloat16" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_weight_" + 
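+    # Each Program_weight_tensor_meta class records a tensor's shape, dtype
+    # and device together with its mean/std; data is None, so these summary
+    # statistics are all that is kept for synthesizing a stand-in tensor.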
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_proj_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.bfloat16"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_hash.txt b/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_hash.txt
new file mode 100644
index 000000000..09c9ca6e4
--- /dev/null
+++ b/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_hash.txt
@@ -0,0 +1 @@
+1c52664f970d35b4998fea877f4ebca0f28f570b2cc253f3f1089fef1b16bfcd
\ No newline at end of file
diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_net.json b/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/ashique_BanglaTraitBERT/graph_net.json
@@ -0,0 +1,6 @@
+{
+  "framework": "torch",
+  "num_devices_required": 1,
+  "num_nodes_required": 1,
+  "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/input_meta.py b/samples/transformers-auto-model/ashique_BanglaTraitBERT/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/input_tensor_constraints.py b/samples/transformers-auto-model/ashique_BanglaTraitBERT/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/model.py b/samples/transformers-auto-model/ashique_BanglaTraitBERT/model.py
new file mode 100644
index 000000000..db4be6d2e
--- /dev/null
+++ b/samples/transformers-auto-model/ashique_BanglaTraitBERT/model.py
@@ -0,0 +1,3271 @@
+import torch
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_token_type_ids_: torch.Tensor,
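+        # The three tensors above are the tokenized model inputs; every
+        # remaining argument is a parameter or buffer lifted out of the
+        # captured module, named after its original path, e.g.
+        # self.embeddings.word_embeddings.weight becomes
+        # L_self_modules_embeddings_modules_word_embeddings_parameters_weight_.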
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_buffers_position_ids_: torch.Tensor,
+        L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_attention_mask_ = L_attention_mask_ + l_token_type_ids_ = L_token_type_ids_ + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ + 
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_
+        extended_attention_mask = l_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_attention_mask_ = None
+        extended_attention_mask_1 = extended_attention_mask.to(dtype=torch.float32)
+        extended_attention_mask = None
+        sub = 1.0 - extended_attention_mask_1
+        extended_attention_mask_1 = None
+        extended_attention_mask_2 = sub * -3.4028234663852886e38
+        sub = None
+        position_ids = l_self_modules_embeddings_buffers_position_ids_[
+            (slice(None, None, None), slice(0, 21, None))
+        ]
+        l_self_modules_embeddings_buffers_position_ids_ = None
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            0,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        token_type_embeddings = torch.nn.functional.embedding(
+            l_token_type_ids_,
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_token_type_ids_ = (
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        ) = None
+        embeddings = inputs_embeds + token_type_embeddings
+        inputs_embeds = token_type_embeddings = None
+        position_embeddings = torch.nn.functional.embedding(
+            position_ids,
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        position_ids = (
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        ) = None
+        embeddings += position_embeddings
+        embeddings_1 = embeddings
+        embeddings = position_embeddings = None
+        embeddings_2 = torch.nn.functional.layer_norm(
+            embeddings_1,
+            (1024,),
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+            l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        embeddings_1 = (
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False)
+        embeddings_2 = None
+        query_layer = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view = query_layer.view(1, -1, 16, 64)
+        query_layer = None
+        query_layer_1 = view.transpose(1, 2)
+        view = None
+        key_layer = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_1 = key_layer.view(1, -1, 16, 64)
+        key_layer = None
+        key_layer_1 = view_1.transpose(1, 2)
+        view_1 = None
+        value_layer = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_2 = value_layer.view(1, -1, 16, 64)
+        value_layer = None
+        value_layer_1 = view_2.transpose(1, 2)
+        view_2 = None
+        transpose_3 = key_layer_1.transpose(-1, -2)
+        key_layer_1 = None
+        attention_scores = torch.matmul(query_layer_1, transpose_3)
+        query_layer_1 = transpose_3 = None
+        attention_scores_1 = attention_scores / 8.0
+        attention_scores = None
+        attention_scores_2 = attention_scores_1 + extended_attention_mask_2
+        attention_scores_1 = None
+        attention_probs = torch.nn.functional.softmax(attention_scores_2, dim=-1)
+        attention_scores_2 = None
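+        # layer 0: dropout on the attention weights, then the weighted sum
+        # over the value vectors (standard scaled-dot-product attention)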
+        attention_probs_1 = torch.nn.functional.dropout(
+            attention_probs, 0.1, False, False
+        )
+        attention_probs = None
+        context_layer = torch.matmul(attention_probs_1, value_layer_1)
+        attention_probs_1 = value_layer_1 = None
+        permute = context_layer.permute(0, 2, 1, 3)
+        context_layer = None
+        context_layer_1 = permute.contiguous()
+        permute = None
+        context_layer_2 = context_layer_1.view((1, 21, 1024))
+        context_layer_1 = None
+        hidden_states = torch._C._nn.linear(
+            context_layer_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False)
+        hidden_states = None
+        add_2 = hidden_states_1 + embeddings_3
+        hidden_states_1 = embeddings_3 = None
+        hidden_states_2 = torch.nn.functional.layer_norm(
+            add_2,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_3 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+        hidden_states_3 = None
+        hidden_states_5 = torch._C._nn.linear(
+            hidden_states_4,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_6 = torch.nn.functional.dropout(
+            hidden_states_5, 0.1, False, False
+        )
+        hidden_states_5 = None
+        add_3 = hidden_states_6 + hidden_states_2
+        hidden_states_6 = hidden_states_2 = None
+        hidden_states_7 = torch.nn.functional.layer_norm(
+            add_3,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_2 = torch._C._nn.linear(
+            hidden_states_7,
+        query_layer_2 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_4 = query_layer_2.view(1, -1, 16, 64)
+        query_layer_2 = None
+        query_layer_3 = view_4.transpose(1, 2)
+        view_4 = None
+        key_layer_2 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_5 = key_layer_2.view(1, -1, 16, 64)
+        key_layer_2 = None
+        key_layer_3 = view_5.transpose(1, 2)
+        view_5 = None
+        value_layer_2 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_6 = value_layer_2.view(1, -1, 16, 64)
+        value_layer_2 = None
+        value_layer_3 = view_6.transpose(1, 2)
+        view_6 = None
+        transpose_7 = key_layer_3.transpose(-1, -2)
+        key_layer_3 = None
+        attention_scores_3 = torch.matmul(query_layer_3, transpose_7)
+        query_layer_3 = transpose_7 = None
+        attention_scores_4 = attention_scores_3 / 8.0
+        attention_scores_3 = None
+        attention_scores_5 = attention_scores_4 + extended_attention_mask_2
+        attention_scores_4 = None
+        attention_probs_2 = torch.nn.functional.softmax(attention_scores_5, dim=-1)
+        attention_scores_5 = None
+        attention_probs_3 = torch.nn.functional.dropout(attention_probs_2, 0.1, False, False)
+        attention_probs_2 = None
+        context_layer_3 = torch.matmul(attention_probs_3, value_layer_3)
+        attention_probs_3 = value_layer_3 = None
+        permute_1 = context_layer_3.permute(0, 2, 1, 3)
+        context_layer_3 = None
+        context_layer_4 = permute_1.contiguous()
+        permute_1 = None
+        context_layer_5 = context_layer_4.view((1, 21, 1024))
+        context_layer_4 = None
+        hidden_states_8 = torch._C._nn.linear(context_layer_5, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_9 = torch.nn.functional.dropout(hidden_states_8, 0.1, False, False)
+        hidden_states_8 = None
+        add_5 = hidden_states_9 + hidden_states_7
+        hidden_states_9 = hidden_states_7 = None
+        hidden_states_10 = torch.nn.functional.layer_norm(add_5, (1024,), l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_11 = torch._C._nn.linear(hidden_states_10, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+        hidden_states_11 = None
+        hidden_states_13 = torch._C._nn.linear(hidden_states_12, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_)
+        hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_14 = torch.nn.functional.dropout(hidden_states_13, 0.1, False, False)
+        hidden_states_13 = None
+        add_6 = hidden_states_14 + hidden_states_10
+        hidden_states_14 = hidden_states_10 = None
+        hidden_states_15 = torch.nn.functional.layer_norm(add_6, (1024,), l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_4 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_8 = query_layer_4.view(1, -1, 16, 64)
+        query_layer_4 = None
+        query_layer_5 = view_8.transpose(1, 2)
+        view_8 = None
+        key_layer_4 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_9 = key_layer_4.view(1, -1, 16, 64)
+        key_layer_4 = None
+        key_layer_5 = view_9.transpose(1, 2)
+        view_9 = None
+        value_layer_4 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_10 = value_layer_4.view(1, -1, 16, 64)
+        value_layer_4 = None
+        value_layer_5 = view_10.transpose(1, 2)
+        view_10 = None
+        transpose_11 = key_layer_5.transpose(-1, -2)
+        key_layer_5 = None
+        attention_scores_6 = torch.matmul(query_layer_5, transpose_11)
+        query_layer_5 = transpose_11 = None
+        attention_scores_7 = attention_scores_6 / 8.0
+        attention_scores_6 = None
+        attention_scores_8 = attention_scores_7 + extended_attention_mask_2
+        attention_scores_7 = None
+        attention_probs_4 = torch.nn.functional.softmax(attention_scores_8, dim=-1)
+        attention_scores_8 = None
+        attention_probs_5 = torch.nn.functional.dropout(attention_probs_4, 0.1, False, False)
+        attention_probs_4 = None
+        context_layer_6 = torch.matmul(attention_probs_5, value_layer_5)
+        attention_probs_5 = value_layer_5 = None
+        permute_2 = context_layer_6.permute(0, 2, 1, 3)
+        context_layer_6 = None
+        context_layer_7 = permute_2.contiguous()
+        permute_2 = None
+        context_layer_8 = context_layer_7.view((1, 21, 1024))
+        context_layer_7 = None
+        hidden_states_16 = torch._C._nn.linear(context_layer_8, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_17 = torch.nn.functional.dropout(hidden_states_16, 0.1, False, False)
+        hidden_states_16 = None
+        add_8 = hidden_states_17 + hidden_states_15
+        hidden_states_17 = hidden_states_15 = None
+        hidden_states_18 = torch.nn.functional.layer_norm(add_8, (1024,), l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_19 = torch._C._nn.linear(hidden_states_18, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+        hidden_states_19 = None
+        hidden_states_21 = torch._C._nn.linear(hidden_states_20, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_)
+        hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_22 = torch.nn.functional.dropout(hidden_states_21, 0.1, False, False)
+        hidden_states_21 = None
+        add_9 = hidden_states_22 + hidden_states_18
+        hidden_states_22 = hidden_states_18 = None
+        hidden_states_23 = torch.nn.functional.layer_norm(add_9, (1024,), l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_9 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_6 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_12 = query_layer_6.view(1, -1, 16, 64)
+        query_layer_6 = None
+        query_layer_7 = view_12.transpose(1, 2)
+        view_12 = None
+        key_layer_6 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_13 = key_layer_6.view(1, -1, 16, 64)
+        key_layer_6 = None
+        key_layer_7 = view_13.transpose(1, 2)
+        view_13 = None
+        value_layer_6 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_14 = value_layer_6.view(1, -1, 16, 64)
+        value_layer_6 = None
+        value_layer_7 = view_14.transpose(1, 2)
+        view_14 = None
+        transpose_15 = key_layer_7.transpose(-1, -2)
+        key_layer_7 = None
+        attention_scores_9 = torch.matmul(query_layer_7, transpose_15)
+        query_layer_7 = transpose_15 = None
+        attention_scores_10 = attention_scores_9 / 8.0
+        attention_scores_9 = None
+        attention_scores_11 = attention_scores_10 + extended_attention_mask_2
+        attention_scores_10 = None
+        attention_probs_6 = torch.nn.functional.softmax(attention_scores_11, dim=-1)
+        attention_scores_11 = None
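+        # extended_attention_mask_2 (built earlier in the traced graph, not
+        # shown in this hunk) is the usual additive attention mask: roughly
+        # 0.0 at attendable positions and a large negative value at masked
+        # ones, so the softmax above drives masked probabilities to zero.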
+        attention_probs_7 = torch.nn.functional.dropout(attention_probs_6, 0.1, False, False)
+        attention_probs_6 = None
+        context_layer_9 = torch.matmul(attention_probs_7, value_layer_7)
+        attention_probs_7 = value_layer_7 = None
+        permute_3 = context_layer_9.permute(0, 2, 1, 3)
+        context_layer_9 = None
+        context_layer_10 = permute_3.contiguous()
+        permute_3 = None
+        context_layer_11 = context_layer_10.view((1, 21, 1024))
+        context_layer_10 = None
+        hidden_states_24 = torch._C._nn.linear(context_layer_11, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_25 = torch.nn.functional.dropout(hidden_states_24, 0.1, False, False)
+        hidden_states_24 = None
+        add_11 = hidden_states_25 + hidden_states_23
+        hidden_states_25 = hidden_states_23 = None
+        hidden_states_26 = torch.nn.functional.layer_norm(add_11, (1024,), l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_27 = torch._C._nn.linear(hidden_states_26, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+        hidden_states_27 = None
+        hidden_states_29 = torch._C._nn.linear(hidden_states_28, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_)
+        hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_30 = torch.nn.functional.dropout(hidden_states_29, 0.1, False, False)
+        hidden_states_29 = None
+        add_12 = hidden_states_30 + hidden_states_26
+        hidden_states_30 = hidden_states_26 = None
+        hidden_states_31 = torch.nn.functional.layer_norm(add_12, (1024,), l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_12 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_8 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_16 = query_layer_8.view(1, -1, 16, 64)
+        query_layer_8 = None
+        query_layer_9 = view_16.transpose(1, 2)
+        view_16 = None
+        key_layer_8 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_17 = key_layer_8.view(1, -1, 16, 64)
+        key_layer_8 = None
+        key_layer_9 = view_17.transpose(1, 2)
+        view_17 = None
+        value_layer_8 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_18 = value_layer_8.view(1, -1, 16, 64)
+        value_layer_8 = None
+        value_layer_9 = view_18.transpose(1, 2)
+        view_18 = None
+        transpose_19 = key_layer_9.transpose(-1, -2)
+        key_layer_9 = None
+        attention_scores_12 = torch.matmul(query_layer_9, transpose_19)
+        query_layer_9 = transpose_19 = None
+        attention_scores_13 = attention_scores_12 / 8.0
+        attention_scores_12 = None
+        attention_scores_14 = attention_scores_13 + extended_attention_mask_2
+        attention_scores_13 = None
+        attention_probs_8 = torch.nn.functional.softmax(attention_scores_14, dim=-1)
+        attention_scores_14 = None
+        attention_probs_9 = torch.nn.functional.dropout(attention_probs_8, 0.1, False, False)
+        attention_probs_8 = None
+        context_layer_12 = torch.matmul(attention_probs_9, value_layer_9)
+        attention_probs_9 = value_layer_9 = None
+        permute_4 = context_layer_12.permute(0, 2, 1, 3)
+        context_layer_12 = None
+        context_layer_13 = permute_4.contiguous()
+        permute_4 = None
+        context_layer_14 = context_layer_13.view((1, 21, 1024))
+        context_layer_13 = None
+        hidden_states_32 = torch._C._nn.linear(context_layer_14, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_33 = torch.nn.functional.dropout(hidden_states_32, 0.1, False, False)
+        hidden_states_32 = None
+        add_14 = hidden_states_33 + hidden_states_31
+        hidden_states_33 = hidden_states_31 = None
+        hidden_states_34 = torch.nn.functional.layer_norm(add_14, (1024,), l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_35 = torch._C._nn.linear(hidden_states_34, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+        hidden_states_35 = None
+        hidden_states_37 = torch._C._nn.linear(hidden_states_36, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_)
+        hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_38 = torch.nn.functional.dropout(hidden_states_37, 0.1, False, False)
+        hidden_states_37 = None
+        add_15 = hidden_states_38 + hidden_states_34
+        hidden_states_38 = hidden_states_34 = None
+        hidden_states_39 = torch.nn.functional.layer_norm(add_15, (1024,), l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_15 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_10 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_20 = query_layer_10.view(1, -1, 16, 64)
+        query_layer_10 = None
+        query_layer_11 = view_20.transpose(1, 2)
+        view_20 = None
+        key_layer_10 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_21 = key_layer_10.view(1, -1, 16, 64)
+        key_layer_10 = None
+        key_layer_11 = view_21.transpose(1, 2)
+        view_21 = None
+        value_layer_10 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_22 = value_layer_10.view(1, -1, 16, 64)
+        value_layer_10 = None
+        value_layer_11 = view_22.transpose(1, 2)
+        view_22 = None
+        transpose_23 = key_layer_11.transpose(-1, -2)
+        key_layer_11 = None
+        attention_scores_15 = torch.matmul(query_layer_11, transpose_23)
+        query_layer_11 = transpose_23 = None
+        attention_scores_16 = attention_scores_15 / 8.0
+        attention_scores_15 = None
+        attention_scores_17 = attention_scores_16 + extended_attention_mask_2
+        attention_scores_16 = None
+        attention_probs_10 = torch.nn.functional.softmax(attention_scores_17, dim=-1)
+        attention_scores_17 = None
+        attention_probs_11 = torch.nn.functional.dropout(attention_probs_10, 0.1, False, False)
+        attention_probs_10 = None
+        context_layer_15 = torch.matmul(attention_probs_11, value_layer_11)
+        attention_probs_11 = value_layer_11 = None
+        permute_5 = context_layer_15.permute(0, 2, 1, 3)
+        context_layer_15 = None
+        context_layer_16 = permute_5.contiguous()
+        permute_5 = None
+        context_layer_17 = context_layer_16.view((1, 21, 1024))
+        context_layer_16 = None
+        hidden_states_40 = torch._C._nn.linear(context_layer_17, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_41 = torch.nn.functional.dropout(hidden_states_40, 0.1, False, False)
+        hidden_states_40 = None
+        add_17 = hidden_states_41 + hidden_states_39
+        hidden_states_41 = hidden_states_39 = None
+        hidden_states_42 = torch.nn.functional.layer_norm(add_17, (1024,), l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_43 = torch._C._nn.linear(hidden_states_42, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_)
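+        # The chained assignments to None throughout this file (e.g. the line
+        # below) are emitted by the graph extractor so that each intermediate
+        # tensor and parameter reference is dropped as soon as it is dead,
+        # keeping peak memory of the replayed graph close to the original run.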
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+        hidden_states_43 = None
+        hidden_states_45 = torch._C._nn.linear(hidden_states_44, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_)
+        hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_46 = torch.nn.functional.dropout(hidden_states_45, 0.1, False, False)
+        hidden_states_45 = None
+        add_18 = hidden_states_46 + hidden_states_42
+        hidden_states_46 = hidden_states_42 = None
+        hidden_states_47 = torch.nn.functional.layer_norm(add_18, (1024,), l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_18 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_12 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_24 = query_layer_12.view(1, -1, 16, 64)
+        query_layer_12 = None
+        query_layer_13 = view_24.transpose(1, 2)
+        view_24 = None
+        key_layer_12 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_25 = key_layer_12.view(1, -1, 16, 64)
+        key_layer_12 = None
+        key_layer_13 = view_25.transpose(1, 2)
+        view_25 = None
+        value_layer_12 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_26 = value_layer_12.view(1, -1, 16, 64)
+        value_layer_12 = None
+        value_layer_13 = view_26.transpose(1, 2)
+        view_26 = None
+        transpose_27 = key_layer_13.transpose(-1, -2)
+        key_layer_13 = None
+        attention_scores_18 = torch.matmul(query_layer_13, transpose_27)
+        query_layer_13 = transpose_27 = None
+        attention_scores_19 = attention_scores_18 / 8.0
+        attention_scores_18 = None
+        attention_scores_20 = attention_scores_19 + extended_attention_mask_2
+        attention_scores_19 = None
+        attention_probs_12 = torch.nn.functional.softmax(attention_scores_20, dim=-1)
+        attention_scores_20 = None
+        attention_probs_13 = torch.nn.functional.dropout(attention_probs_12, 0.1, False, False)
+        attention_probs_12 = None
+        context_layer_18 = torch.matmul(attention_probs_13, value_layer_13)
+        attention_probs_13 = value_layer_13 = None
+        permute_6 = context_layer_18.permute(0, 2, 1, 3)
+        context_layer_18 = None
+        context_layer_19 = permute_6.contiguous()
+        permute_6 = None
+        context_layer_20 = context_layer_19.view((1, 21, 1024))
+        context_layer_19 = None
+        hidden_states_48 = torch._C._nn.linear(context_layer_20, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_49 = torch.nn.functional.dropout(hidden_states_48, 0.1, False, False)
+        hidden_states_48 = None
+        add_20 = hidden_states_49 + hidden_states_47
+        hidden_states_49 = hidden_states_47 = None
+        hidden_states_50 = torch.nn.functional.layer_norm(add_20, (1024,), l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_51 = torch._C._nn.linear(hidden_states_50, l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+        hidden_states_51 = None
+        hidden_states_53 = torch._C._nn.linear(hidden_states_52, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_)
+        hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_54 = torch.nn.functional.dropout(hidden_states_53, 0.1, False, False)
+        hidden_states_53 = None
+        add_21 = hidden_states_54 + hidden_states_50
+        hidden_states_54 = hidden_states_50 = None
+        hidden_states_55 = torch.nn.functional.layer_norm(add_21, (1024,), l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_21 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_14 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_28 = query_layer_14.view(1, -1, 16, 64)
+        query_layer_14 = None
+        query_layer_15 = view_28.transpose(1, 2)
+        view_28 = None
+        key_layer_14 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_29 = key_layer_14.view(1, -1, 16, 64)
+        key_layer_14 = None
+        key_layer_15 = view_29.transpose(1, 2)
+        view_29 = None
+        value_layer_14 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_30 = value_layer_14.view(1, -1, 16, 64)
+        value_layer_14 = None
+        value_layer_15 = view_30.transpose(1, 2)
+        view_30 = None
+        transpose_31 = key_layer_15.transpose(-1, -2)
+        key_layer_15 = None
+        attention_scores_21 = torch.matmul(query_layer_15, transpose_31)
+        query_layer_15 = transpose_31 = None
+        attention_scores_22 = attention_scores_21 / 8.0
+        attention_scores_21 = None
+        attention_scores_23 = attention_scores_22 + extended_attention_mask_2
+        attention_scores_22 = None
+        attention_probs_14 = torch.nn.functional.softmax(attention_scores_23, dim=-1)
+        attention_scores_23 = None
+        attention_probs_15 = torch.nn.functional.dropout(attention_probs_14, 0.1, False, False)
+        attention_probs_14 = None
+        context_layer_21 = torch.matmul(attention_probs_15, value_layer_15)
+        attention_probs_15 = value_layer_15 = None
+        permute_7 = context_layer_21.permute(0, 2, 1, 3)
+        context_layer_21 = None
+        context_layer_22 = permute_7.contiguous()
+        permute_7 = None
+        context_layer_23 = context_layer_22.view((1, 21, 1024))
+        context_layer_22 = None
+        hidden_states_56 = torch._C._nn.linear(context_layer_23, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_57 = torch.nn.functional.dropout(hidden_states_56, 0.1, False, False)
+        hidden_states_56 = None
+        add_23 = hidden_states_57 + hidden_states_55
+        hidden_states_57 = hidden_states_55 = None
+        hidden_states_58 = torch.nn.functional.layer_norm(add_23, (1024,), l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_59 = torch._C._nn.linear(hidden_states_58, l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+        hidden_states_59 = None
+        hidden_states_61 = torch._C._nn.linear(hidden_states_60, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_)
+        hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_62 = torch.nn.functional.dropout(hidden_states_61, 0.1, False, False)
+        hidden_states_61 = None
+        add_24 = hidden_states_62 + hidden_states_58
+        hidden_states_62 = hidden_states_58 = None
+        hidden_states_63 = torch.nn.functional.layer_norm(add_24, (1024,), l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_24 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_16 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_32 = query_layer_16.view(1, -1, 16, 64)
+        query_layer_16 = None
+        query_layer_17 = view_32.transpose(1, 2)
+        view_32 = None
+        key_layer_16 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_33 = key_layer_16.view(1, -1, 16, 64)
+        key_layer_16 = None
+        key_layer_17 = view_33.transpose(1, 2)
+        view_33 = None
+        value_layer_16 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_34 = value_layer_16.view(1, -1, 16, 64)
+        value_layer_16 = None
+        value_layer_17 = view_34.transpose(1, 2)
+        view_34 = None
+        transpose_35 = key_layer_17.transpose(-1, -2)
+        key_layer_17 = None
+        attention_scores_24 = torch.matmul(query_layer_17, transpose_35)
+        query_layer_17 = transpose_35 = None
+        attention_scores_25 = attention_scores_24 / 8.0
+        attention_scores_24 = None
+        attention_scores_26 = attention_scores_25 + extended_attention_mask_2
+        attention_scores_25 = None
+        attention_probs_16 = torch.nn.functional.softmax(attention_scores_26, dim=-1)
+        attention_scores_26 = None
+        attention_probs_17 = torch.nn.functional.dropout(attention_probs_16, 0.1, False, False)
+        attention_probs_16 = None
+        context_layer_24 = torch.matmul(attention_probs_17, value_layer_17)
+        attention_probs_17 = value_layer_17 = None
+        permute_8 = context_layer_24.permute(0, 2, 1, 3)
+        context_layer_24 = None
+        context_layer_25 = permute_8.contiguous()
+        permute_8 = None
+        context_layer_26 = context_layer_25.view((1, 21, 1024))
+        context_layer_25 = None
+        hidden_states_64 = torch._C._nn.linear(context_layer_26, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_65 = torch.nn.functional.dropout(hidden_states_64, 0.1, False, False)
+        hidden_states_64 = None
+        add_26 = hidden_states_65 + hidden_states_63
+        hidden_states_65 = hidden_states_63 = None
+        hidden_states_66 = torch.nn.functional.layer_norm(add_26, (1024,), l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_67 = torch._C._nn.linear(hidden_states_66, l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_68 = torch._C._nn.gelu(hidden_states_67)
+        hidden_states_67 = None
+        hidden_states_69 = torch._C._nn.linear(hidden_states_68, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_)
+        hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_70 = torch.nn.functional.dropout(hidden_states_69, 0.1, False, False)
+        hidden_states_69 = None
+        add_27 = hidden_states_70 + hidden_states_66
+        hidden_states_70 = hidden_states_66 = None
+        hidden_states_71 = torch.nn.functional.layer_norm(add_27, (1024,), l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_27 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_18 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_36 = query_layer_18.view(1, -1, 16, 64)
+        query_layer_18 = None
+        query_layer_19 = view_36.transpose(1, 2)
+        view_36 = None
+        key_layer_18 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_37 = key_layer_18.view(1, -1, 16, 64)
+        key_layer_18 = None
+        key_layer_19 = view_37.transpose(1, 2)
+        view_37 = None
+        value_layer_18 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_38 = value_layer_18.view(1, -1, 16, 64)
+        value_layer_18 = None
+        value_layer_19 = view_38.transpose(1, 2)
+        view_38 = None
+        transpose_39 = key_layer_19.transpose(-1, -2)
+        key_layer_19 = None
+        attention_scores_27 = torch.matmul(query_layer_19, transpose_39)
+        query_layer_19 = transpose_39 = None
+        attention_scores_28 = attention_scores_27 / 8.0
+        attention_scores_27 = None
+        attention_scores_29 = attention_scores_28 + extended_attention_mask_2
+        attention_scores_28 = None
+        attention_probs_18 = torch.nn.functional.softmax(attention_scores_29, dim=-1)
+        attention_scores_29 = None
+        attention_probs_19 = torch.nn.functional.dropout(attention_probs_18, 0.1, False, False)
+        attention_probs_18 = None
+        context_layer_27 = torch.matmul(attention_probs_19, value_layer_19)
+        attention_probs_19 = value_layer_19 = None
+        permute_9 = context_layer_27.permute(0, 2, 1, 3)
+        context_layer_27 = None
+        context_layer_28 = permute_9.contiguous()
+        permute_9 = None
+        context_layer_29 = context_layer_28.view((1, 21, 1024))
+        context_layer_28 = None
+        hidden_states_72 = torch._C._nn.linear(context_layer_29, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_73 = torch.nn.functional.dropout(hidden_states_72, 0.1, False, False)
+        hidden_states_72 = None
+        add_29 = hidden_states_73 + hidden_states_71
+        hidden_states_73 = hidden_states_71 = None
+        hidden_states_74 = torch.nn.functional.layer_norm(add_29, (1024,), l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_75 = torch._C._nn.linear(hidden_states_74, l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_76 = torch._C._nn.gelu(hidden_states_75)
+        hidden_states_75 = None
+        hidden_states_77 = torch._C._nn.linear(hidden_states_76, l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_)
+        hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_78 = torch.nn.functional.dropout(hidden_states_77, 0.1, False, False)
+        hidden_states_77 = None
+        add_30 = hidden_states_78 + hidden_states_74
+        hidden_states_78 = hidden_states_74 = None
+        hidden_states_79 = torch.nn.functional.layer_norm(add_30, (1024,), l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_30 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_20 = torch._C._nn.linear(hidden_states_79, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_40 = query_layer_20.view(1, -1, 16, 64)
+        query_layer_20 = None
+        query_layer_21 = view_40.transpose(1, 2)
+        view_40 = None
+        key_layer_20 = torch._C._nn.linear(hidden_states_79, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_41 = key_layer_20.view(1, -1, 16, 64)
+        key_layer_20 = None
+        key_layer_21 = view_41.transpose(1, 2)
+        view_41 = None
+        value_layer_20 = torch._C._nn.linear(hidden_states_79, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_42 = value_layer_20.view(1, -1, 16, 64)
+        value_layer_20 = None
+        value_layer_21 = view_42.transpose(1, 2)
+        view_42 = None
+        transpose_43 = key_layer_21.transpose(-1, -2)
+        key_layer_21 = None
+        attention_scores_30 = torch.matmul(query_layer_21, transpose_43)
+        query_layer_21 = transpose_43 = None
+        attention_scores_31 = attention_scores_30 / 8.0
+        attention_scores_30 = None
+        attention_scores_32 = attention_scores_31 + extended_attention_mask_2
+        attention_scores_31 = None
+        attention_probs_20 = torch.nn.functional.softmax(attention_scores_32, dim=-1)
+        attention_scores_32 = None
+        attention_probs_21 = torch.nn.functional.dropout(attention_probs_20, 0.1, False, False)
+        attention_probs_20 = None
+        context_layer_30 = torch.matmul(attention_probs_21, value_layer_21)
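+        # Note on the dropout calls throughout this graph: the positional
+        # arguments are (input, p=0.1, training=False, inplace=False), so with
+        # training=False every torch.nn.functional.dropout here is an identity
+        # op; the calls are retained because the trace records the module
+        # exactly as it ran in inference mode.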
+        attention_probs_21 = value_layer_21 = None
+        permute_10 = context_layer_30.permute(0, 2, 1, 3)
+        context_layer_30 = None
+        context_layer_31 = permute_10.contiguous()
+        permute_10 = None
+        context_layer_32 = context_layer_31.view((1, 21, 1024))
+        context_layer_31 = None
+        hidden_states_80 = torch._C._nn.linear(context_layer_32, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_)
+        context_layer_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_81 = torch.nn.functional.dropout(hidden_states_80, 0.1, False, False)
+        hidden_states_80 = None
+        add_32 = hidden_states_81 + hidden_states_79
+        hidden_states_81 = hidden_states_79 = None
+        hidden_states_82 = torch.nn.functional.layer_norm(add_32, (1024,), l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_83 = torch._C._nn.linear(hidden_states_82, l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_)
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_84 = torch._C._nn.gelu(hidden_states_83)
+        hidden_states_83 = None
+        hidden_states_85 = torch._C._nn.linear(hidden_states_84, l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_)
+        hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_86 = torch.nn.functional.dropout(hidden_states_85, 0.1, False, False)
+        hidden_states_85 = None
+        add_33 = hidden_states_86 + hidden_states_82
+        hidden_states_86 = hidden_states_82 = None
+        hidden_states_87 = torch.nn.functional.layer_norm(add_33, (1024,), l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_, 1e-12)
+        add_33 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = None
l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_44 = query_layer_22.view(1, -1, 16, 64) + query_layer_22 = None + query_layer_23 = view_44.transpose(1, 2) + view_44 = None + key_layer_22 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_45 = key_layer_22.view(1, -1, 16, 64) + key_layer_22 = None + key_layer_23 = view_45.transpose(1, 2) + view_45 = None + value_layer_22 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_46 = value_layer_22.view(1, -1, 16, 64) + value_layer_22 = None + value_layer_23 = view_46.transpose(1, 2) + view_46 = None + transpose_47 = key_layer_23.transpose(-1, -2) + key_layer_23 = None + attention_scores_33 = torch.matmul(query_layer_23, transpose_47) + query_layer_23 = transpose_47 = None + attention_scores_34 = attention_scores_33 / 8.0 + attention_scores_33 = None + attention_scores_35 = attention_scores_34 + extended_attention_mask_2 + attention_scores_34 = None + attention_probs_22 = torch.nn.functional.softmax(attention_scores_35, dim=-1) + attention_scores_35 = None + attention_probs_23 = torch.nn.functional.dropout( + attention_probs_22, 0.1, False, False + ) + attention_probs_22 = None + context_layer_33 = torch.matmul(attention_probs_23, value_layer_23) + attention_probs_23 = value_layer_23 = None + permute_11 = context_layer_33.permute(0, 2, 1, 3) + context_layer_33 = None + context_layer_34 = permute_11.contiguous() + permute_11 = None + context_layer_35 = context_layer_34.view((1, 21, 1024)) + context_layer_34 = None + hidden_states_88 = torch._C._nn.linear( + context_layer_35, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_89 = torch.nn.functional.dropout( + hidden_states_88, 0.1, False, False + ) + hidden_states_88 = None + add_35 = hidden_states_89 + hidden_states_87 + 
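+        # residual connection around the attention block, then LayerNorm (eps 1e-12)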
+        hidden_states_89 = hidden_states_87 = None
+        hidden_states_90 = torch.nn.functional.layer_norm(
+            add_35,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_91 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_92 = torch._C._nn.gelu(hidden_states_91)
+        hidden_states_91 = None
+        hidden_states_93 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_94 = torch.nn.functional.dropout(
+            hidden_states_93, 0.1, False, False
+        )
+        hidden_states_93 = None
+        add_36 = hidden_states_94 + hidden_states_90
+        hidden_states_94 = hidden_states_90 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            add_36,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_36 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_48 = query_layer_24.view(1, -1, 16, 64)
+        query_layer_24 = None
+        query_layer_25 = view_48.transpose(1, 2)
+        view_48 = None
+        key_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_49 = key_layer_24.view(1, -1, 16, 64)
+        key_layer_24 = None
+        key_layer_25 = view_49.transpose(1, 2)
+        view_49 = None
+        value_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_50 = value_layer_24.view(1, -1, 16, 64)
+        value_layer_24 = None
+        value_layer_25 = view_50.transpose(1, 2)
+        view_50 = None
+        transpose_51 = key_layer_25.transpose(-1, -2)
+        key_layer_25 = None
+        attention_scores_36 = torch.matmul(query_layer_25, transpose_51)
+        query_layer_25 = transpose_51 = None
+        attention_scores_37 = attention_scores_36 / 8.0
+        attention_scores_36 = None
+        attention_scores_38 = attention_scores_37 + extended_attention_mask_2
+        attention_scores_37 = None
+        attention_probs_24 = torch.nn.functional.softmax(attention_scores_38, dim=-1)
+        attention_scores_38 = None
+        attention_probs_25 = torch.nn.functional.dropout(
+            attention_probs_24, 0.1, False, False
+        )
+        attention_probs_24 = None
+        context_layer_36 = torch.matmul(attention_probs_25, value_layer_25)
+        attention_probs_25 = value_layer_25 = None
+        permute_12 = context_layer_36.permute(0, 2, 1, 3)
+        context_layer_36 = None
+        context_layer_37 = permute_12.contiguous()
+        permute_12 = None
+        context_layer_38 = context_layer_37.view((1, 21, 1024))
+        context_layer_37 = None
+        hidden_states_96 = torch._C._nn.linear(
+            context_layer_38,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_97 = torch.nn.functional.dropout(
+            hidden_states_96, 0.1, False, False
+        )
+        hidden_states_96 = None
+        add_38 = hidden_states_97 + hidden_states_95
+        hidden_states_97 = hidden_states_95 = None
+        hidden_states_98 = torch.nn.functional.layer_norm(
+            add_38,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_99 = torch._C._nn.linear(
+            hidden_states_98,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_,
+        )
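+        # feed-forward tail: GELU, second dense projection, dropout, residual add, LayerNorm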
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_100 = torch._C._nn.gelu(hidden_states_99)
+        hidden_states_99 = None
+        hidden_states_101 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_100 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_102 = torch.nn.functional.dropout(
+            hidden_states_101, 0.1, False, False
+        )
+        hidden_states_101 = None
+        add_39 = hidden_states_102 + hidden_states_98
+        hidden_states_102 = hidden_states_98 = None
+        hidden_states_103 = torch.nn.functional.layer_norm(
+            add_39,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_39 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_52 = query_layer_26.view(1, -1, 16, 64)
+        query_layer_26 = None
+        query_layer_27 = view_52.transpose(1, 2)
+        view_52 = None
+        key_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_53 = key_layer_26.view(1, -1, 16, 64)
+        key_layer_26 = None
+        key_layer_27 = view_53.transpose(1, 2)
+        view_53 = None
+        value_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_54 = value_layer_26.view(1, -1, 16, 64)
+        value_layer_26 = None
+        value_layer_27 = view_54.transpose(1, 2)
+        view_54 = None
+        transpose_55 = key_layer_27.transpose(-1, -2)
+        key_layer_27 = None
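+        # scaled dot-product attention: scores = Q @ K^T / 8.0 (sqrt of head dim 64),
+        # additive attention mask, then softmax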
+        attention_scores_39 = torch.matmul(query_layer_27, transpose_55)
+        query_layer_27 = transpose_55 = None
+        attention_scores_40 = attention_scores_39 / 8.0
+        attention_scores_39 = None
+        attention_scores_41 = attention_scores_40 + extended_attention_mask_2
+        attention_scores_40 = None
+        attention_probs_26 = torch.nn.functional.softmax(attention_scores_41, dim=-1)
+        attention_scores_41 = None
+        attention_probs_27 = torch.nn.functional.dropout(
+            attention_probs_26, 0.1, False, False
+        )
+        attention_probs_26 = None
+        context_layer_39 = torch.matmul(attention_probs_27, value_layer_27)
+        attention_probs_27 = value_layer_27 = None
+        permute_13 = context_layer_39.permute(0, 2, 1, 3)
+        context_layer_39 = None
+        context_layer_40 = permute_13.contiguous()
+        permute_13 = None
+        context_layer_41 = context_layer_40.view((1, 21, 1024))
+        context_layer_40 = None
+        hidden_states_104 = torch._C._nn.linear(
+            context_layer_41,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_105 = torch.nn.functional.dropout(
+            hidden_states_104, 0.1, False, False
+        )
+        hidden_states_104 = None
+        add_41 = hidden_states_105 + hidden_states_103
+        hidden_states_105 = hidden_states_103 = None
+        hidden_states_106 = torch.nn.functional.layer_norm(
+            add_41,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_107 = torch._C._nn.linear(
+            hidden_states_106,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_108 = torch._C._nn.gelu(hidden_states_107)
+        hidden_states_107 = None
+        hidden_states_109 = torch._C._nn.linear(
+            hidden_states_108,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_108 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_110 = torch.nn.functional.dropout(
+            hidden_states_109, 0.1, False, False
+        )
+        hidden_states_109 = None
+        add_42 = hidden_states_110 + hidden_states_106
+        hidden_states_110 = hidden_states_106 = None
+        hidden_states_111 = torch.nn.functional.layer_norm(
+            add_42,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_42 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_28 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_56 = query_layer_28.view(1, -1, 16, 64)
+        query_layer_28 = None
+        query_layer_29 = view_56.transpose(1, 2)
+        view_56 = None
+        key_layer_28 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_57 = key_layer_28.view(1, -1, 16, 64)
+        key_layer_28 = None
+        key_layer_29 = view_57.transpose(1, 2)
+        view_57 = None
+        value_layer_28 = torch._C._nn.linear(
+            hidden_states_111,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_58 = value_layer_28.view(1, -1, 16, 64)
+        value_layer_28 = None
+        value_layer_29 = view_58.transpose(1, 2)
+        view_58 = None
+        transpose_59 = key_layer_29.transpose(-1, -2)
+        key_layer_29 = None
+        attention_scores_42 = torch.matmul(query_layer_29, transpose_59)
+        query_layer_29 = transpose_59 = None
+        attention_scores_43 = attention_scores_42 / 8.0
+        attention_scores_42 = None
+        attention_scores_44 = attention_scores_43 + extended_attention_mask_2
+        attention_scores_43 = None
+        attention_probs_28 = torch.nn.functional.softmax(attention_scores_44, dim=-1)
+        attention_scores_44 = None
+        attention_probs_29 = torch.nn.functional.dropout(
+            attention_probs_28, 0.1, False, False
+        )
+        attention_probs_28 = None
+        context_layer_42 = torch.matmul(attention_probs_29, value_layer_29)
+        attention_probs_29 = value_layer_29 = None
+        permute_14 = context_layer_42.permute(0, 2, 1, 3)
+        context_layer_42 = None
+        context_layer_43 = permute_14.contiguous()
+        permute_14 = None
+        context_layer_44 = context_layer_43.view((1, 21, 1024))
+        context_layer_43 = None
+        hidden_states_112 = torch._C._nn.linear(
+            context_layer_44,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_113 = torch.nn.functional.dropout(
+            hidden_states_112, 0.1, False, False
+        )
+        hidden_states_112 = None
+        add_44 = hidden_states_113 + hidden_states_111
+        hidden_states_113 = hidden_states_111 = None
+        hidden_states_114 = torch.nn.functional.layer_norm(
+            add_44,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_115 = torch._C._nn.linear(
+            hidden_states_114,
+            l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_116 = torch._C._nn.gelu(hidden_states_115)
+        hidden_states_115 = None
+        hidden_states_117 = torch._C._nn.linear(
+            hidden_states_116,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_116 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_118 = torch.nn.functional.dropout(
+            hidden_states_117, 0.1, False, False
+        )
+        hidden_states_117 = None
+        add_45 = hidden_states_118 + hidden_states_114
+        hidden_states_118 = hidden_states_114 = None
+        hidden_states_119 = torch.nn.functional.layer_norm(
+            add_45,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_45 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_30 = torch._C._nn.linear(
+            hidden_states_119,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_60 = query_layer_30.view(1, -1, 16, 64)
+        query_layer_30 = None
+        query_layer_31 = view_60.transpose(1, 2)
+        view_60 = None
+        key_layer_30 = torch._C._nn.linear(
+            hidden_states_119,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_61 = key_layer_30.view(1, -1, 16, 64)
+        key_layer_30 = None
+        key_layer_31 = view_61.transpose(1, 2)
+        view_61 = None
+        value_layer_30 = torch._C._nn.linear(
+            hidden_states_119,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_62 = value_layer_30.view(1, -1, 16, 64)
+        value_layer_30 = None
+        value_layer_31 = view_62.transpose(1, 2)
+        view_62 = None
+        transpose_63 = key_layer_31.transpose(-1, -2)
+        key_layer_31 = None
+        attention_scores_45 = torch.matmul(query_layer_31, transpose_63)
+        query_layer_31 = transpose_63 = None
+        attention_scores_46 = attention_scores_45 / 8.0
+        attention_scores_45 = None
+        attention_scores_47 = attention_scores_46 + extended_attention_mask_2
+        attention_scores_46 = None
+        attention_probs_30 = torch.nn.functional.softmax(attention_scores_47, dim=-1)
+        attention_scores_47 = None
+        attention_probs_31 = torch.nn.functional.dropout(
+            attention_probs_30, 0.1, False, False
+        )
+        attention_probs_30 = None
+        context_layer_45 = torch.matmul(attention_probs_31, value_layer_31)
+        attention_probs_31 = value_layer_31 = None
+        permute_15 = context_layer_45.permute(0, 2, 1, 3)
+        context_layer_45 = None
+        context_layer_46 = permute_15.contiguous()
+        permute_15 = None
+        context_layer_47 = context_layer_46.view((1, 21, 1024))
+        context_layer_46 = None
+        hidden_states_120 = torch._C._nn.linear(
+            context_layer_47,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_121 = torch.nn.functional.dropout(
+            hidden_states_120, 0.1, False, False
+        )
+        hidden_states_120 = None
+        add_47 = hidden_states_121 + hidden_states_119
+        hidden_states_121 = hidden_states_119 = None
+        hidden_states_122 = torch.nn.functional.layer_norm(
+            add_47,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_123 = torch._C._nn.linear(
+            hidden_states_122,
+            l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_124 = torch._C._nn.gelu(hidden_states_123)
+        hidden_states_123 = None
+        hidden_states_125 = torch._C._nn.linear(
+            hidden_states_124,
+            l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_124 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_126 = torch.nn.functional.dropout(
+            hidden_states_125, 0.1, False, False
+        )
+        hidden_states_125 = None
+        add_48 = hidden_states_126 + hidden_states_122
+        hidden_states_126 = hidden_states_122 = None
+        hidden_states_127 = torch.nn.functional.layer_norm(
+            add_48,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_48 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_32 = torch._C._nn.linear(
+            hidden_states_127,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_64 = query_layer_32.view(1, -1, 16, 64)
+        query_layer_32 = None
+        query_layer_33 = view_64.transpose(1, 2)
+        view_64 = None
+        key_layer_32 = torch._C._nn.linear(
+            hidden_states_127,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_65 = key_layer_32.view(1, -1, 16, 64)
+        key_layer_32 = None
+        key_layer_33 = view_65.transpose(1, 2)
+        view_65 = None
+        value_layer_32 = torch._C._nn.linear(
+            hidden_states_127,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_66 = value_layer_32.view(1, -1, 16, 64)
+        value_layer_32 = None
+        value_layer_33 = view_66.transpose(1, 2)
+        view_66 = None
+        transpose_67 = key_layer_33.transpose(-1, -2)
+        key_layer_33 = None
+        attention_scores_48 = torch.matmul(query_layer_33, transpose_67)
+        query_layer_33 = transpose_67 = None
+        attention_scores_49 = attention_scores_48 / 8.0
+        attention_scores_48 = None
+        attention_scores_50 = attention_scores_49 + extended_attention_mask_2
+        attention_scores_49 = None
+        attention_probs_32 = torch.nn.functional.softmax(attention_scores_50, dim=-1)
+        attention_scores_50 = None
+        attention_probs_33 = torch.nn.functional.dropout(
+            attention_probs_32, 0.1, False, False
+        )
+        attention_probs_32 = None
+        context_layer_48 = torch.matmul(attention_probs_33, value_layer_33)
+        attention_probs_33 = value_layer_33 = None
+        permute_16 = context_layer_48.permute(0, 2, 1, 3)
+        context_layer_48 = None
+        context_layer_49 = permute_16.contiguous()
+        permute_16 = None
+        context_layer_50 = context_layer_49.view((1, 21, 1024))
+        context_layer_49 = None
+        hidden_states_128 = torch._C._nn.linear(
+            context_layer_50,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_129 = torch.nn.functional.dropout(
+            hidden_states_128, 0.1, False, False
+        )
+        hidden_states_128 = None
+        add_50 = hidden_states_129 + hidden_states_127
+        hidden_states_129 = hidden_states_127 = None
+        hidden_states_130 = torch.nn.functional.layer_norm(
+            add_50,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_131 = torch._C._nn.linear(
+            hidden_states_130,
+            l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_132 = torch._C._nn.gelu(hidden_states_131)
+        hidden_states_131 = None
+        hidden_states_133 = torch._C._nn.linear(
+            hidden_states_132,
+            l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_132 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_134 = torch.nn.functional.dropout(
+            hidden_states_133, 0.1, False, False
+        )
+        hidden_states_133 = None
+        add_51 = hidden_states_134 + hidden_states_130
+        hidden_states_134 = hidden_states_130 = None
+        hidden_states_135 = torch.nn.functional.layer_norm(
+            add_51,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_51 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_34 = torch._C._nn.linear(
+            hidden_states_135,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_68 = query_layer_34.view(1, -1, 16, 64)
+        query_layer_34 = None
+        query_layer_35 = view_68.transpose(1, 2)
+        view_68 = None
+        key_layer_34 = torch._C._nn.linear(
+            hidden_states_135,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_69 = key_layer_34.view(1, -1, 16, 64)
+        key_layer_34 = None
+        key_layer_35 = view_69.transpose(1, 2)
+        view_69 = None
+        value_layer_34 = torch._C._nn.linear(
+            hidden_states_135,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_70 = value_layer_34.view(1, -1, 16, 64)
+        value_layer_34 = None
+        value_layer_35 = view_70.transpose(1, 2)
+        view_70 = None
+        transpose_71 = key_layer_35.transpose(-1, -2)
+        key_layer_35 = None
+        attention_scores_51 = torch.matmul(query_layer_35, transpose_71)
+        query_layer_35 = transpose_71 = None
+        attention_scores_52 = attention_scores_51 / 8.0
+        attention_scores_51 = None
+        attention_scores_53 = attention_scores_52 + extended_attention_mask_2
+        attention_scores_52 = None
+        attention_probs_34 = torch.nn.functional.softmax(attention_scores_53, dim=-1)
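+        # attention dropout (p=0.1) followed by the weighted sum over the value heads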
+        attention_scores_53 = None
+        attention_probs_35 = torch.nn.functional.dropout(
+            attention_probs_34, 0.1, False, False
+        )
+        attention_probs_34 = None
+        context_layer_51 = torch.matmul(attention_probs_35, value_layer_35)
+        attention_probs_35 = value_layer_35 = None
+        permute_17 = context_layer_51.permute(0, 2, 1, 3)
+        context_layer_51 = None
+        context_layer_52 = permute_17.contiguous()
+        permute_17 = None
+        context_layer_53 = context_layer_52.view((1, 21, 1024))
+        context_layer_52 = None
+        hidden_states_136 = torch._C._nn.linear(
+            context_layer_53,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_137 = torch.nn.functional.dropout(
+            hidden_states_136, 0.1, False, False
+        )
+        hidden_states_136 = None
+        add_53 = hidden_states_137 + hidden_states_135
+        hidden_states_137 = hidden_states_135 = None
+        hidden_states_138 = torch.nn.functional.layer_norm(
+            add_53,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_139 = torch._C._nn.linear(
+            hidden_states_138,
+            l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_140 = torch._C._nn.gelu(hidden_states_139)
+        hidden_states_139 = None
+        hidden_states_141 = torch._C._nn.linear(
+            hidden_states_140,
+            l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_140 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_142 = torch.nn.functional.dropout(
+            hidden_states_141, 0.1, False, False
+        )
+        hidden_states_141 = None
+        add_54 = hidden_states_142 + hidden_states_138
+        hidden_states_142 = hidden_states_138 = None
+        hidden_states_143 = torch.nn.functional.layer_norm(
+            add_54,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_54 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_36 = torch._C._nn.linear(
+            hidden_states_143,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_72 = query_layer_36.view(1, -1, 16, 64)
+        query_layer_36 = None
+        query_layer_37 = view_72.transpose(1, 2)
+        view_72 = None
+        key_layer_36 = torch._C._nn.linear(
+            hidden_states_143,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_73 = key_layer_36.view(1, -1, 16, 64)
+        key_layer_36 = None
+        key_layer_37 = view_73.transpose(1, 2)
+        view_73 = None
+        value_layer_36 = torch._C._nn.linear(
+            hidden_states_143,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_74 = value_layer_36.view(1, -1, 16, 64)
+        value_layer_36 = None
+        value_layer_37 = view_74.transpose(1, 2)
+        view_74 = None
+        transpose_75 = key_layer_37.transpose(-1, -2)
+        key_layer_37 = None
+        attention_scores_54 = torch.matmul(query_layer_37, transpose_75)
+        query_layer_37 = transpose_75 = None
+        attention_scores_55 = attention_scores_54 / 8.0
+        attention_scores_54 = None
+        attention_scores_56 = attention_scores_55 + extended_attention_mask_2
+        attention_scores_55 = None
+        attention_probs_36 = torch.nn.functional.softmax(attention_scores_56, dim=-1)
+        attention_scores_56 = None
+        attention_probs_37 = torch.nn.functional.dropout(
+            attention_probs_36, 0.1, False, False
+        )
+        attention_probs_36 = None
+        context_layer_54 = torch.matmul(attention_probs_37, value_layer_37)
+        attention_probs_37 = value_layer_37 = None
+        permute_18 = context_layer_54.permute(0, 2, 1, 3)
+        context_layer_54 = None
+        context_layer_55 = permute_18.contiguous()
+        permute_18 = None
+        context_layer_56 = context_layer_55.view((1, 21, 1024))
+        context_layer_55 = None
+        hidden_states_144 = torch._C._nn.linear(
+            context_layer_56,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_56 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_145 = torch.nn.functional.dropout(
+            hidden_states_144, 0.1, False, False
+        )
+        hidden_states_144 = None
+        add_56 = hidden_states_145 + hidden_states_143
+        hidden_states_145 = hidden_states_143 = None
+        hidden_states_146 = torch.nn.functional.layer_norm(
+            add_56,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_56 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_147 = torch._C._nn.linear(
+            hidden_states_146,
+            l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_148 = torch._C._nn.gelu(hidden_states_147)
+        hidden_states_147 = None
+        hidden_states_149 = torch._C._nn.linear(
+            hidden_states_148,
+            l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_148 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_150 = torch.nn.functional.dropout(
+            hidden_states_149, 0.1, False, False
+        )
+        hidden_states_149 = None
+        add_57 = hidden_states_150 + hidden_states_146
+        hidden_states_150 = hidden_states_146 = None
+        hidden_states_151 = torch.nn.functional.layer_norm(
+            add_57,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_57 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_38 = torch._C._nn.linear(
+            hidden_states_151,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_76 = query_layer_38.view(1, -1, 16, 64)
+        query_layer_38 = None
+        query_layer_39 = view_76.transpose(1, 2)
+        view_76 = None
+        key_layer_38 = torch._C._nn.linear(
+            hidden_states_151,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
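+        # reshape to (1, seq, 16, 64) and move the head axis ahead of the sequence axis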
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_77 = key_layer_38.view(1, -1, 16, 64)
+        key_layer_38 = None
+        key_layer_39 = view_77.transpose(1, 2)
+        view_77 = None
+        value_layer_38 = torch._C._nn.linear(
+            hidden_states_151,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_78 = value_layer_38.view(1, -1, 16, 64)
+        value_layer_38 = None
+        value_layer_39 = view_78.transpose(1, 2)
+        view_78 = None
+        transpose_79 = key_layer_39.transpose(-1, -2)
+        key_layer_39 = None
+        attention_scores_57 = torch.matmul(query_layer_39, transpose_79)
+        query_layer_39 = transpose_79 = None
+        attention_scores_58 = attention_scores_57 / 8.0
+        attention_scores_57 = None
+        attention_scores_59 = attention_scores_58 + extended_attention_mask_2
+        attention_scores_58 = None
+        attention_probs_38 = torch.nn.functional.softmax(attention_scores_59, dim=-1)
+        attention_scores_59 = None
+        attention_probs_39 = torch.nn.functional.dropout(
+            attention_probs_38, 0.1, False, False
+        )
+        attention_probs_38 = None
+        context_layer_57 = torch.matmul(attention_probs_39, value_layer_39)
+        attention_probs_39 = value_layer_39 = None
+        permute_19 = context_layer_57.permute(0, 2, 1, 3)
+        context_layer_57 = None
+        context_layer_58 = permute_19.contiguous()
+        permute_19 = None
+        context_layer_59 = context_layer_58.view((1, 21, 1024))
+        context_layer_58 = None
+        hidden_states_152 = torch._C._nn.linear(
+            context_layer_59,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_153 = torch.nn.functional.dropout(
+            hidden_states_152, 0.1, False, False
+        )
+        hidden_states_152 = None
+        add_59 = hidden_states_153 + hidden_states_151
+        hidden_states_153 = hidden_states_151 = None
+        hidden_states_154 = torch.nn.functional.layer_norm(
+            add_59,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_155 = torch._C._nn.linear(
+            hidden_states_154,
+            l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_156 = torch._C._nn.gelu(hidden_states_155)
+        hidden_states_155 = None
+        hidden_states_157 = torch._C._nn.linear(
+            hidden_states_156,
+            l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_156 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_158 = torch.nn.functional.dropout(
+            hidden_states_157, 0.1, False, False
+        )
+        hidden_states_157 = None
+        add_60 = hidden_states_158 + hidden_states_154
+        hidden_states_158 = hidden_states_154 = None
+        hidden_states_159 = torch.nn.functional.layer_norm(
+            add_60,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_60 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_40 = torch._C._nn.linear(
+            hidden_states_159,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_80 = query_layer_40.view(1, -1, 16, 64)
+        query_layer_40 = None
+        query_layer_41 = view_80.transpose(1, 2)
+        view_80 = None
+        key_layer_40 = torch._C._nn.linear(
+            hidden_states_159,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_81 = key_layer_40.view(1, -1, 16, 64)
+        key_layer_40 = None
+        key_layer_41 = view_81.transpose(1, 2)
+        view_81 = None
+        value_layer_40 = torch._C._nn.linear(
+            hidden_states_159,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_82 = value_layer_40.view(1, -1, 16, 64)
+        value_layer_40 = None
+        value_layer_41 = view_82.transpose(1, 2)
+        view_82 = None
+        transpose_83 = key_layer_41.transpose(-1, -2)
+        key_layer_41 = None
+        attention_scores_60 = torch.matmul(query_layer_41, transpose_83)
+        query_layer_41 = transpose_83 = None
+        attention_scores_61 = attention_scores_60 / 8.0
+        attention_scores_60 = None
+        attention_scores_62 = attention_scores_61 + extended_attention_mask_2
+        attention_scores_61 = None
+        attention_probs_40 = torch.nn.functional.softmax(attention_scores_62, dim=-1)
+        attention_scores_62 = None
+        attention_probs_41 = torch.nn.functional.dropout(
+            attention_probs_40, 0.1, False, False
+        )
+        attention_probs_40 = None
+        context_layer_60 = torch.matmul(attention_probs_41, value_layer_41)
+        attention_probs_41 = value_layer_41 = None
+        permute_20 = context_layer_60.permute(0, 2, 1, 3)
+        context_layer_60 = None
+        context_layer_61 = permute_20.contiguous()
+        permute_20 = None
+        context_layer_62 = context_layer_61.view((1, 21, 1024))
+        context_layer_61 = None
+        hidden_states_160 = torch._C._nn.linear(
+            context_layer_62,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_161 = torch.nn.functional.dropout(
+            hidden_states_160, 0.1, False, False
+        )
+        hidden_states_160 = None
+        add_62 = hidden_states_161 + hidden_states_159
+        hidden_states_161 = hidden_states_159 = None
+        hidden_states_162 = torch.nn.functional.layer_norm(
+            add_62,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_163 = torch._C._nn.linear(
+            hidden_states_162,
+            l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_164 = torch._C._nn.gelu(hidden_states_163)
+        hidden_states_163 = None
+        hidden_states_165 = torch._C._nn.linear(
+            hidden_states_164,
+            l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_164 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_166 = torch.nn.functional.dropout(
+            hidden_states_165, 0.1, False, False
+        )
+        hidden_states_165 = None
+        add_63 = hidden_states_166 + hidden_states_162
+        hidden_states_166 = hidden_states_162 = None
+        hidden_states_167 = torch.nn.functional.layer_norm(
+            add_63,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_63 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_42 = torch._C._nn.linear(
+            hidden_states_167,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_84 = query_layer_42.view(1, -1, 16, 64)
+        query_layer_42 = None
+        query_layer_43 = view_84.transpose(1, 2)
+        view_84 = None
+        key_layer_42 = torch._C._nn.linear(
+            hidden_states_167,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_85 = key_layer_42.view(1, -1, 16, 64)
+        key_layer_42 = None
+        key_layer_43 = view_85.transpose(1, 2)
+        view_85 = None
+        value_layer_42 = torch._C._nn.linear(
+            hidden_states_167,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_86 = value_layer_42.view(1, -1, 16, 64)
+        value_layer_42 = None
+        value_layer_43 = view_86.transpose(1, 2)
+        view_86 = None
+        transpose_87 = key_layer_43.transpose(-1, -2)
+        key_layer_43 = None
+        attention_scores_63 = torch.matmul(query_layer_43, transpose_87)
+        query_layer_43 = transpose_87 = None
+        attention_scores_64 = attention_scores_63 / 8.0
+        attention_scores_63 = None
+        attention_scores_65 = attention_scores_64 + extended_attention_mask_2
+        attention_scores_64 = None
+        attention_probs_42 = torch.nn.functional.softmax(attention_scores_65, dim=-1)
+        attention_scores_65 = None
+        attention_probs_43 = torch.nn.functional.dropout(
+            attention_probs_42, 0.1, False, False
+        )
+        attention_probs_42 = None
+        context_layer_63 = torch.matmul(attention_probs_43, value_layer_43)
+        attention_probs_43 = value_layer_43 = None
+        permute_21 = context_layer_63.permute(0, 2, 1, 3)
+        context_layer_63 = None
+        context_layer_64 = permute_21.contiguous()
+        permute_21 = None
+        context_layer_65 = context_layer_64.view((1, 21, 1024))
+        context_layer_64 = None
+        hidden_states_168 = torch._C._nn.linear(
+            context_layer_65,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_65 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_169 = torch.nn.functional.dropout(
+            hidden_states_168, 0.1, False, False
+        )
+        hidden_states_168 = None
+        add_65 = hidden_states_169 + hidden_states_167
+        hidden_states_169 = hidden_states_167 = None
+        hidden_states_170 = torch.nn.functional.layer_norm(
+            add_65,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_65 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_171 = torch._C._nn.linear(
+            hidden_states_170,
+            l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_172 = torch._C._nn.gelu(hidden_states_171)
+        hidden_states_171 = None
+        hidden_states_173 = torch._C._nn.linear(
+            hidden_states_172,
+            l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_172 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_174 = torch.nn.functional.dropout(
+            hidden_states_173, 0.1, False, False
+        )
+        hidden_states_173 = None
+        add_66 = hidden_states_174 + hidden_states_170
+        hidden_states_174 = hidden_states_170 = None
+        hidden_states_175 = torch.nn.functional.layer_norm(
+            add_66,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_66 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        query_layer_44 = torch._C._nn.linear(
+            hidden_states_175,
+            l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ =
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_88 = query_layer_44.view(1, -1, 16, 64) + query_layer_44 = None + query_layer_45 = view_88.transpose(1, 2) + view_88 = None + key_layer_44 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_89 = key_layer_44.view(1, -1, 16, 64) + key_layer_44 = None + key_layer_45 = view_89.transpose(1, 2) + view_89 = None + value_layer_44 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_90 = value_layer_44.view(1, -1, 16, 64) + value_layer_44 = None + value_layer_45 = view_90.transpose(1, 2) + view_90 = None + transpose_91 = key_layer_45.transpose(-1, -2) + key_layer_45 = None + attention_scores_66 = torch.matmul(query_layer_45, transpose_91) + query_layer_45 = transpose_91 = None + attention_scores_67 = attention_scores_66 / 8.0 + attention_scores_66 = None + attention_scores_68 = attention_scores_67 + extended_attention_mask_2 + attention_scores_67 = None + attention_probs_44 = torch.nn.functional.softmax(attention_scores_68, dim=-1) + attention_scores_68 = None + attention_probs_45 = torch.nn.functional.dropout( + attention_probs_44, 0.1, False, False + ) + attention_probs_44 = None + context_layer_66 = torch.matmul(attention_probs_45, value_layer_45) + attention_probs_45 = value_layer_45 = None + permute_22 = context_layer_66.permute(0, 2, 1, 3) + context_layer_66 = None + context_layer_67 = permute_22.contiguous() + permute_22 = None + context_layer_68 = context_layer_67.view((1, 21, 1024)) + context_layer_67 = None + hidden_states_176 = torch._C._nn.linear( + context_layer_68, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_177 = torch.nn.functional.dropout( + hidden_states_176, 0.1, False, False + ) + hidden_states_176 = None + add_68 = hidden_states_177 + hidden_states_175 + hidden_states_177 = hidden_states_175 = None + hidden_states_178 = torch.nn.functional.layer_norm( + add_68, + (1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_179 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_180 = torch._C._nn.gelu(hidden_states_179) + hidden_states_179 = None + hidden_states_181 = torch._C._nn.linear( + hidden_states_180, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_180 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_182 = torch.nn.functional.dropout( + hidden_states_181, 0.1, False, False + ) + hidden_states_181 = None + add_69 = hidden_states_182 + hidden_states_178 + hidden_states_182 = hidden_states_178 = None + hidden_states_183 = torch.nn.functional.layer_norm( + add_69, + (1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_69 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_46 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_92 = query_layer_46.view(1, -1, 16, 64) + query_layer_46 = None + query_layer_47 = view_92.transpose(1, 2) + view_92 = None + key_layer_46 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_93 = key_layer_46.view(1, -1, 16, 64) + key_layer_46 = None + key_layer_47 = view_93.transpose(1, 2) + view_93 = None + value_layer_46 = torch._C._nn.linear( + hidden_states_183, + 
l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_94 = value_layer_46.view(1, -1, 16, 64) + value_layer_46 = None + value_layer_47 = view_94.transpose(1, 2) + view_94 = None + transpose_95 = key_layer_47.transpose(-1, -2) + key_layer_47 = None + attention_scores_69 = torch.matmul(query_layer_47, transpose_95) + query_layer_47 = transpose_95 = None + attention_scores_70 = attention_scores_69 / 8.0 + attention_scores_69 = None + attention_scores_71 = attention_scores_70 + extended_attention_mask_2 + attention_scores_70 = extended_attention_mask_2 = None + attention_probs_46 = torch.nn.functional.softmax(attention_scores_71, dim=-1) + attention_scores_71 = None + attention_probs_47 = torch.nn.functional.dropout( + attention_probs_46, 0.1, False, False + ) + attention_probs_46 = None + context_layer_69 = torch.matmul(attention_probs_47, value_layer_47) + attention_probs_47 = value_layer_47 = None + permute_23 = context_layer_69.permute(0, 2, 1, 3) + context_layer_69 = None + context_layer_70 = permute_23.contiguous() + permute_23 = None + context_layer_71 = context_layer_70.view((1, 21, 1024)) + context_layer_70 = None + hidden_states_184 = torch._C._nn.linear( + context_layer_71, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_185 = torch.nn.functional.dropout( + hidden_states_184, 0.1, False, False + ) + hidden_states_184 = None + add_71 = hidden_states_185 + hidden_states_183 + hidden_states_185 = hidden_states_183 = None + hidden_states_186 = torch.nn.functional.layer_norm( + add_71, + (1024,), + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_187 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_188 = torch._C._nn.gelu(hidden_states_187) + hidden_states_187 = None + hidden_states_189 = torch._C._nn.linear( + 
hidden_states_188, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_188 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_190 = torch.nn.functional.dropout( + hidden_states_189, 0.1, False, False + ) + hidden_states_189 = None + add_72 = hidden_states_190 + hidden_states_186 + hidden_states_190 = hidden_states_186 = None + hidden_states_191 = torch.nn.functional.layer_norm( + add_72, + (1024,), + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_72 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = (None) + return (hidden_states_191,) diff --git a/samples/transformers-auto-model/ashique_BanglaTraitBERT/weight_meta.py b/samples/transformers-auto-model/ashique_BanglaTraitBERT/weight_meta.py new file mode 100644 index 000000000..91d256151 --- /dev/null +++ b/samples/transformers-auto-model/ashique_BanglaTraitBERT/weight_meta.py @@ -0,0 +1,3951 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 21] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [ + 2, + 16018, + 4606, + 446, + 480, + 2289, + 3154, + 26255, + 2017, + 23870, + 4084, + 4110, + 10349, + 2709, + 10790, + 11166, + 4606, + 446, + 478, + 18, + 3, + ] + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 21] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_token_type_ids_: + name = "L_token_type_ids_" + shape = [1, 21] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [32000, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_: + name = "L_self_modules_embeddings_buffers_position_ids_" + shape = [1, 512] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 511 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [2, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + +
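
Each weight_meta.py pairs every graph input with a small metadata class: tensor name, shape, dtype and device, plus either the exact `data` payload (kept only for tiny integer tensors such as the [1, 21] input_ids) or summary statistics (`mean`/`std` for float parameters, `min_val`/`max_val` for the position_ids buffer). A minimal sketch of how such a record could be replayed into a stand-in tensor follows; the `materialize` helper is hypothetical, and the repository's own loader may differ:

import torch


def materialize(meta):
    # Map "torch.float32" / "torch.int64" strings back to real dtypes.
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    if getattr(meta, "data", None) is not None:
        # Small tensors (input_ids, attention_mask, token_type_ids) carry
        # their exact values.
        return torch.tensor(meta.data, dtype=dtype, device=meta.device).reshape(meta.shape)
    if getattr(meta, "min_val", None) is not None:
        # Buffers like position_ids record only their value range.
        return torch.randint(meta.min_val, meta.max_val + 1, tuple(meta.shape),
                             dtype=dtype, device=meta.device)
    # Float parameters keep mean/std only: synthesize a statistically
    # similar stand-in, which is all graph replay needs.
    return (torch.randn(meta.shape) * meta.std + meta.mean).to(device=meta.device, dtype=dtype)

For example, `materialize(Program_weight_tensor_meta_L_input_ids_)` rebuilds the exact [1, 21] token tensor above, while the 32000 x 1024 word-embedding matrix comes back as a fresh sample with mean ~0 and std 0.020.

+class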
Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 
0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + 
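# shape [4096, 1024] below is the feed-forward expansion: 4 x the 1024-dim hidden size +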
name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = 
None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + 
shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + 
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 
1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype 
= "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 
1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [4096, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 4096]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_hash.txt b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_hash.txt
new file mode 100644
index 000000000..5dab9e34e
--- /dev/null
+++ b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_hash.txt
@@ -0,0 +1 @@
+044db93fa7108425565776cb5af513f966ddd7448dbcf0043be396cbd01eeeb9
\ No newline at end of file
diff --git a/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_net.json b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+ "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_meta.py b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_tensor_constraints.py b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/model.py b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/model.py new file mode 100644 index 000000000..a152a96b1 --- /dev/null +++ b/samples/transformers-auto-model/fusersam_Sentiment-Analysis-Model/model.py @@ -0,0 +1,1818 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_self_modules_embeddings_buffers_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_dropout_p: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + 
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_input_ids_ = L_input_ids_
+        l_self_modules_embeddings_buffers_token_type_ids_ = (
+            L_self_modules_embeddings_buffers_token_type_ids_
+        )
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
+        l_self_modules_embeddings_modules_dropout_p = (
+            L_self_modules_embeddings_modules_dropout_p
+        )
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
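+        # Editorial note (not emitted by the capture itself): the L_* argument
+        # names above appear to be Dynamo's flattened renderings of module
+        # paths, e.g. self.encoder.layer[0].attention.self.query.weight is
+        # lifted into the graph input
+        # L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+        # and the body then rebinds each L_* input to a lowercase l_* local
+        # (with LayerNorm lowered to layer_norm). This reading of the naming
+        # scheme is inferred from the surrounding generated code, not from
+        # documented behavior.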
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_
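+        # Editorial note: the dropout probabilities travel through the graph
+        # as plain torch.Tensor inputs (the *_dropout_p arguments) rather than
+        # as baked-in Python floats, and further down they are read back with
+        # .item() before being used as the dropout probability. This is an
+        # observation about the generated code, not documented Dynamo
+        # behavior.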
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_pooler_modules_dense_parameters_weight_ = (
+            L_self_modules_pooler_modules_dense_parameters_weight_
+        )
+        l_self_modules_pooler_modules_dense_parameters_bias_ = (
+            L_self_modules_pooler_modules_dense_parameters_bias_
+        )
+        buffered_token_type_ids = l_self_modules_embeddings_buffers_token_type_ids_[
+            (slice(None, None, None), slice(None, 11, None))
+        ]
+        l_self_modules_embeddings_buffers_token_type_ids_ = None
+        buffered_token_type_ids_expanded = buffered_token_type_ids.expand(1, 11)
+        buffered_token_type_ids = None
+        ne = l_input_ids_.ne(1)
+        mask = ne.int()
+        ne = None
+        cumsum = torch.cumsum(mask, dim=1)
+        type_as = cumsum.type_as(mask)
+        cumsum = None
+        add = type_as + 0
+        type_as = None
+        incremental_indices = add * mask
+        add = mask = None
+        long = incremental_indices.long()
+        incremental_indices = None
+        position_ids = long + 1
+        long = None
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        token_type_embeddings = torch.nn.functional.embedding(
+            buffered_token_type_ids_expanded,
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        buffered_token_type_ids_expanded = (
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        ) = None
+        embeddings = inputs_embeds + token_type_embeddings
+        inputs_embeds = token_type_embeddings = None
+        position_embeddings = torch.nn.functional.embedding(
+            position_ids,
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+            1,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        position_ids = (
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        ) = None
+        embeddings += position_embeddings
+        embeddings_1 = embeddings
+        embeddings = position_embeddings = None
+        embeddings_2 = torch.nn.functional.layer_norm(
+            embeddings_1,
+            (768,),
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+            l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        embeddings_1 = (
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        item = l_self_modules_embeddings_modules_dropout_p.item()
+        l_self_modules_embeddings_modules_dropout_p = None
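+        # The captured ops above implement RoBERTa-style position ids with
+        # padding_idx=1: mask = (input_ids != 1), then cumsum(mask) * mask + 1,
+        # so padding tokens stay at position 1 and real tokens count up from 2.
+        # For example, input_ids [0, 31414, 2, 1] gives mask [1, 1, 1, 0],
+        # cumsum [1, 2, 3, 3], and position_ids [2, 3, 4, 1]. Word, token-type,
+        # and position embeddings are then summed and layer-normalized.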
+        embeddings_3 = torch.nn.functional.dropout(embeddings_2, item, False, False)
+        embeddings_2 = item = None
+        getitem_1 = l_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_attention_mask_ = None
+        expand_1 = getitem_1.expand(1, 1, 11, 11)
+        getitem_1 = None
+        expanded_mask = expand_1.to(torch.float32)
+        expand_1 = None
+        tensor = torch.tensor(1.0, dtype=torch.float32)
+        inverted_mask = tensor - expanded_mask
+        tensor = expanded_mask = None
+        to_1 = inverted_mask.to(torch.bool)
+        extended_attention_mask = inverted_mask.masked_fill(
+            to_1, -3.4028234663852886e38
+        )
+        inverted_mask = to_1 = None
+        linear = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view = linear.view(1, -1, 12, 64)
+        linear = None
+        query_layer = view.transpose(1, 2)
+        view = None
+        linear_1 = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_1 = linear_1.view(1, -1, 12, 64)
+        linear_1 = None
+        key_layer = view_1.transpose(1, 2)
+        view_1 = None
+        linear_2 = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_2 = linear_2.view(1, -1, 12, 64)
+        linear_2 = None
+        value_layer = view_2.transpose(1, 2)
+        view_2 = None
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer = key_layer = value_layer = None
+        attn_output_1 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_2 = attn_output_1.reshape(1, 11, 768)
+        attn_output_1 = None
+        hidden_states = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_1 = (
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p.item()
+        )
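+        # First captured encoder layer: q/k/v projections are viewed as
+        # (1, -1, 12, 64) and transposed to per-head layout, fused attention
+        # runs through torch._C._nn.scaled_dot_product_attention using the
+        # additive extended_attention_mask built above (0.0 where
+        # attention_mask is 1, -3.4028234663852886e+38 where it is 0), and the
+        # heads are merged back to (1, 11, 768) for the output projection.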
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_1 = torch.nn.functional.dropout(
+            hidden_states, item_1, False, False
+        )
+        hidden_states = item_1 = None
+        add_3 = hidden_states_1 + embeddings_3
+        hidden_states_1 = embeddings_3 = None
+        hidden_states_2 = torch.nn.functional.layer_norm(
+            add_3,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_3 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+        hidden_states_3 = None
+        hidden_states_5 = torch._C._nn.linear(
+            hidden_states_4,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+        item_2 = (
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_6 = torch.nn.functional.dropout(
+            hidden_states_5, item_2, False, False
+        )
+        hidden_states_5 = item_2 = None
+        add_4 = hidden_states_6 + hidden_states_2
+        hidden_states_6 = hidden_states_2 = None
+        hidden_states_7 = torch.nn.functional.layer_norm(
+            add_4,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_6 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_3 = linear_6.view(1, -1, 12, 64)
+        linear_6 = None
+        query_layer_1 = view_3.transpose(1, 2)
+        view_3 = None
+        linear_7 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_4 = linear_7.view(1, -1, 12, 64)
+        linear_7 = None
+        key_layer_1 = view_4.transpose(1, 2)
+        view_4 = None
+        linear_8 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_5 = linear_8.view(1, -1, 12, 64)
+        linear_8 = None
+        value_layer_1 = view_5.transpose(1, 2)
+        view_5 = None
+        attn_output_3 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_1,
+            key_layer_1,
+            value_layer_1,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_1 = key_layer_1 = value_layer_1 = None
+        attn_output_4 = attn_output_3.transpose(1, 2)
+        attn_output_3 = None
+        attn_output_5 = attn_output_4.reshape(1, 11, 768)
+        attn_output_4 = None
+        hidden_states_8 = torch._C._nn.linear(
+            attn_output_5,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_3 = (
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_9 = torch.nn.functional.dropout(
+            hidden_states_8, item_3, False, False
+        )
+        hidden_states_8 = item_3 = None
+        add_5 = hidden_states_9 + hidden_states_7
+        hidden_states_9 = hidden_states_7 = None
+        hidden_states_10 = torch.nn.functional.layer_norm(
+            add_5,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_11 = torch._C._nn.linear(
+            hidden_states_10,
+            l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+        hidden_states_11 = None
+        hidden_states_13 = torch._C._nn.linear(
+            hidden_states_12,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None)
+        item_4 = (
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_14 = torch.nn.functional.dropout(
+            hidden_states_13, item_4, False, False
+        )
+        hidden_states_13 = item_4 = None
+        add_6 = hidden_states_14 + hidden_states_10
+        hidden_states_14 = hidden_states_10 = None
+        hidden_states_15 = torch.nn.functional.layer_norm(
+            add_6,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_12 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_6 = linear_12.view(1, -1, 12, 64)
+        linear_12 = None
+        query_layer_2 = view_6.transpose(1, 2)
+        view_6 = None
+        linear_13 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_7 = linear_13.view(1, -1, 12, 64)
+        linear_13 = None
+        key_layer_2 = view_7.transpose(1, 2)
+        view_7 = None
+        linear_14 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_8 = linear_14.view(1, -1, 12, 64)
+        linear_14 = None
+        value_layer_2 = view_8.transpose(1, 2)
+        view_8 = None
+        attn_output_6 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_2,
+            key_layer_2,
+            value_layer_2,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_2 = key_layer_2 = value_layer_2 = None
+        attn_output_7 = attn_output_6.transpose(1, 2)
+        attn_output_6 = None
+        attn_output_8 = attn_output_7.reshape(1, 11, 768)
+        attn_output_7 = None
+        hidden_states_16 = torch._C._nn.linear(
+            attn_output_8,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_5 = (
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_17 = torch.nn.functional.dropout(
+            hidden_states_16, item_5, False, False
+        )
+        hidden_states_16 = item_5 = None
+        add_7 = hidden_states_17 + hidden_states_15
+        hidden_states_17 = hidden_states_15 = None
+        hidden_states_18 = torch.nn.functional.layer_norm(
+            add_7,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_7 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_19 = torch._C._nn.linear(
+            hidden_states_18,
+            l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+        hidden_states_19 = None
+        hidden_states_21 = torch._C._nn.linear(
+            hidden_states_20,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None)
+        item_6 = (
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_22 = torch.nn.functional.dropout(
+            hidden_states_21, item_6, False, False
+        )
+        hidden_states_21 = item_6 = None
+        add_8 = hidden_states_22 + hidden_states_18
+        hidden_states_22 = hidden_states_18 = None
+        hidden_states_23 = torch.nn.functional.layer_norm(
+            add_8,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_18 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_9 = linear_18.view(1, -1, 12, 64)
+        linear_18 = None
+        query_layer_3 = view_9.transpose(1, 2)
+        view_9 = None
+        linear_19 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_10 = linear_19.view(1, -1, 12, 64)
+        linear_19 = None
+        key_layer_3 = view_10.transpose(1, 2)
+        view_10 = None
+        linear_20 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_11 = linear_20.view(1, -1, 12, 64)
+        linear_20 = None
+        value_layer_3 = view_11.transpose(1, 2)
+        view_11 = None
+        attn_output_9 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_3,
+            key_layer_3,
+            value_layer_3,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_3 = key_layer_3 = value_layer_3 = None
+        attn_output_10 = attn_output_9.transpose(1, 2)
+        attn_output_9 = None
+        attn_output_11 = attn_output_10.reshape(1, 11, 768)
+        attn_output_10 = None
+        hidden_states_24 = torch._C._nn.linear(
+            attn_output_11,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_7 = (
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_25 = torch.nn.functional.dropout(
+            hidden_states_24, item_7, False, False
+        )
+        hidden_states_24 = item_7 = None
+        add_9 = hidden_states_25 + hidden_states_23
+        hidden_states_25 = hidden_states_23 = None
+        hidden_states_26 = torch.nn.functional.layer_norm(
+            add_9,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_9 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_27 = torch._C._nn.linear(
+            hidden_states_26,
+            l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+        hidden_states_27 = None
+        hidden_states_29 = torch._C._nn.linear(
+            hidden_states_28,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None)
+        item_8 = (
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_30 = torch.nn.functional.dropout(
+            hidden_states_29, item_8, False, False
+        )
+        hidden_states_29 = item_8 = None
+        add_10 = hidden_states_30 + hidden_states_26
+        hidden_states_30 = hidden_states_26 = None
+        hidden_states_31 = torch.nn.functional.layer_norm(
+            add_10,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_10 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_24 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_12 = linear_24.view(1, -1, 12, 64)
+        linear_24 = None
+        query_layer_4 = view_12.transpose(1, 2)
+        view_12 = None
+        linear_25 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_13 = linear_25.view(1, -1, 12, 64)
+        linear_25 = None
+        key_layer_4 = view_13.transpose(1, 2)
+        view_13 = None
+        linear_26 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_14 = linear_26.view(1, -1, 12, 64)
+        linear_26 = None
+        value_layer_4 = view_14.transpose(1, 2)
+        view_14 = None
+        attn_output_12 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_4,
+            key_layer_4,
+            value_layer_4,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_4 = key_layer_4 = value_layer_4 = None
+        attn_output_13 = attn_output_12.transpose(1, 2)
+        attn_output_12 = None
+        attn_output_14 = attn_output_13.reshape(1, 11, 768)
+        attn_output_13 = None
+        hidden_states_32 = torch._C._nn.linear(
+            attn_output_14,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_9 = (
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_33 = torch.nn.functional.dropout(
+            hidden_states_32, item_9, False, False
+        )
+        hidden_states_32 = item_9 = None
+        add_11 = hidden_states_33 + hidden_states_31
+        hidden_states_33 = hidden_states_31 = None
+        hidden_states_34 = torch.nn.functional.layer_norm(
+            add_11,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_11 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_35 = torch._C._nn.linear(
+            hidden_states_34,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+        hidden_states_35 = None
+        hidden_states_37 = torch._C._nn.linear(
+            hidden_states_36,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = (None)
+        item_10 = (
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_38 = torch.nn.functional.dropout(
+            hidden_states_37, item_10, False, False
+        )
+        hidden_states_37 = item_10 = None
+        add_12 = hidden_states_38 + hidden_states_34
+        hidden_states_38 = hidden_states_34 = None
+        hidden_states_39 = torch.nn.functional.layer_norm(
+            add_12,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_12 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_30 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_15 = linear_30.view(1, -1, 12, 64)
+        linear_30 = None
+        query_layer_5 = view_15.transpose(1, 2)
+        view_15 = None
+        linear_31 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_16 = linear_31.view(1, -1, 12, 64)
+        linear_31 = None
+        key_layer_5 = view_16.transpose(1, 2)
+        view_16 = None
+        linear_32 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_17 = linear_32.view(1, -1, 12, 64)
+        linear_32 = None
+        value_layer_5 = view_17.transpose(1, 2)
+        view_17 = None
+        attn_output_15 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_5,
+            key_layer_5,
+            value_layer_5,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_5 = key_layer_5 = value_layer_5 = None
+        attn_output_16 = attn_output_15.transpose(1, 2)
+        attn_output_15 = None
+        attn_output_17 = attn_output_16.reshape(1, 11, 768)
+        attn_output_16 = None
+        hidden_states_40 = torch._C._nn.linear(
+            attn_output_17,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_11 = (
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_41 = torch.nn.functional.dropout(
+            hidden_states_40, item_11, False, False
+        )
+        hidden_states_40 = item_11 = None
+        add_13 = hidden_states_41 + hidden_states_39
+        hidden_states_41 = hidden_states_39 = None
+        hidden_states_42 = torch.nn.functional.layer_norm(
+            add_13,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_13 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_43 = torch._C._nn.linear(
+            hidden_states_42,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+        hidden_states_43 = None
+        hidden_states_45 = torch._C._nn.linear(
+            hidden_states_44,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = (None)
+        item_12 = (
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_46 = torch.nn.functional.dropout(
+            hidden_states_45, item_12, False, False
+        )
+        hidden_states_45 = item_12 = None
+        add_14 = hidden_states_46 + hidden_states_42
+        hidden_states_46 = hidden_states_42 = None
+        hidden_states_47 = torch.nn.functional.layer_norm(
+            add_14,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_14 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_36 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_18 = linear_36.view(1, -1, 12, 64)
+        linear_36 = None
+        query_layer_6 = view_18.transpose(1, 2)
+        view_18 = None
+        linear_37 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_19 = linear_37.view(1, -1, 12, 64)
+        linear_37 = None
+        key_layer_6 = view_19.transpose(1, 2)
+        view_19 = None
+        linear_38 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_20 = linear_38.view(1, -1, 12, 64)
+        linear_38 = None
+        value_layer_6 = view_20.transpose(1, 2)
+        view_20 = None
+        attn_output_18 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_6,
+            key_layer_6,
+            value_layer_6,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_6 = key_layer_6 = value_layer_6 = None
+        attn_output_19 = attn_output_18.transpose(1, 2)
+        attn_output_18 = None
+        attn_output_20 = attn_output_19.reshape(1, 11, 768)
+        attn_output_19 = None
+        hidden_states_48 = torch._C._nn.linear(
+            attn_output_20,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_13 = (
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p.item()
+        )
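+        # Dropout probabilities are captured as tensor inputs rather than
+        # Python constants; each item_* read recovers the scalar p before the
+        # corresponding functional dropout call, whose training flag is False
+        # here, so the op is an identity at inference.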
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_49 = torch.nn.functional.dropout(
+            hidden_states_48, item_13, False, False
+        )
+        hidden_states_48 = item_13 = None
+        add_15 = hidden_states_49 + hidden_states_47
+        hidden_states_49 = hidden_states_47 = None
+        hidden_states_50 = torch.nn.functional.layer_norm(
+            add_15,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_15 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_51 = torch._C._nn.linear(
+            hidden_states_50,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+        hidden_states_51 = None
+        hidden_states_53 = torch._C._nn.linear(
+            hidden_states_52,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = (None)
+        item_14 = (
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_54 = torch.nn.functional.dropout(
+            hidden_states_53, item_14, False, False
+        )
+        hidden_states_53 = item_14 = None
+        add_16 = hidden_states_54 + hidden_states_50
+        hidden_states_54 = hidden_states_50 = None
+        hidden_states_55 = torch.nn.functional.layer_norm(
+            add_16,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_16 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_42 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_21 = linear_42.view(1, -1, 12, 64)
+        linear_42 = None
+        query_layer_7 = view_21.transpose(1, 2)
+        view_21 = None
+        linear_43 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_22 = linear_43.view(1, -1, 12, 64)
+        linear_43 = None
+        key_layer_7 = view_22.transpose(1, 2)
+        view_22 = None
+        linear_44 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_23 = linear_44.view(1, -1, 12, 64)
+        linear_44 = None
+        value_layer_7 = view_23.transpose(1, 2)
+        view_23 = None
+        attn_output_21 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_7,
+            key_layer_7,
+            value_layer_7,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_7 = key_layer_7 = value_layer_7 = None
+        attn_output_22 = attn_output_21.transpose(1, 2)
+        attn_output_21 = None
+        attn_output_23 = attn_output_22.reshape(1, 11, 768)
+        attn_output_22 = None
+        hidden_states_56 = torch._C._nn.linear(
+            attn_output_23,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_15 = (
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_57 = torch.nn.functional.dropout(
+            hidden_states_56, item_15, False, False
+        )
+        hidden_states_56 = item_15 = None
+        add_17 = hidden_states_57 + hidden_states_55
+        hidden_states_57 = hidden_states_55 = None
+        hidden_states_58 = torch.nn.functional.layer_norm(
+            add_17,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_17 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_59 = torch._C._nn.linear(
+            hidden_states_58,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+        hidden_states_59 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = (None)
+        item_16 = (
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_62 = torch.nn.functional.dropout(
+            hidden_states_61, item_16, False, False
+        )
+        hidden_states_61 = item_16 = None
+        add_18 = hidden_states_62 + hidden_states_58
+        hidden_states_62 = hidden_states_58 = None
+        hidden_states_63 = torch.nn.functional.layer_norm(
+            add_18,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_18 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_48 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_24 = linear_48.view(1, -1, 12, 64)
+        linear_48 = None
+        query_layer_8 = view_24.transpose(1, 2)
+        view_24 = None
+        linear_49 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_25 = linear_49.view(1, -1, 12, 64)
+        linear_49 = None
+        key_layer_8 = view_25.transpose(1, 2)
+        view_25 = None
+        linear_50 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_26 = linear_50.view(1, -1, 12, 64)
+        linear_50 = None
+        value_layer_8 = view_26.transpose(1, 2)
+        view_26 = None
+        attn_output_24 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_8,
+            key_layer_8,
+            value_layer_8,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_8 = key_layer_8 = value_layer_8 = None
+        attn_output_25 = attn_output_24.transpose(1, 2)
+        attn_output_24 = None
+        attn_output_26 = attn_output_25.reshape(1, 11, 768)
+        attn_output_25 = None
+        hidden_states_64 = torch._C._nn.linear(
+            attn_output_26,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        item_17 = (
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_65 = torch.nn.functional.dropout(
+            hidden_states_64, item_17, False, False
+        )
+        hidden_states_64 = item_17 = None
+        add_19 = hidden_states_65 + hidden_states_63
+        hidden_states_65 = hidden_states_63 = None
+        hidden_states_66 = torch.nn.functional.layer_norm(
+            add_19,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_19 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_67 = torch._C._nn.linear(
+            hidden_states_66,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_68 = torch._C._nn.gelu(hidden_states_67)
+        hidden_states_67 = None
+        hidden_states_69 = torch._C._nn.linear(
+            hidden_states_68,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = (None)
+        item_18 = (
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p.item()
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_70 = torch.nn.functional.dropout(
+            hidden_states_69, item_18, False, False
+        )
+        hidden_states_69 = item_18 = None
+        add_20 = hidden_states_70 + hidden_states_66
+        hidden_states_70 = hidden_states_66 = None
+        hidden_states_71 = torch.nn.functional.layer_norm(
torch.nn.functional.layer_norm( + add_20, + (768,), + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_20 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_54 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_27 = linear_54.view(1, -1, 12, 64) + linear_54 = None + query_layer_9 = view_27.transpose(1, 2) + view_27 = None + linear_55 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_28 = linear_55.view(1, -1, 12, 64) + linear_55 = None + key_layer_9 = view_28.transpose(1, 2) + view_28 = None + linear_56 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_29 = linear_56.view(1, -1, 12, 64) + linear_56 = None + value_layer_9 = view_29.transpose(1, 2) + view_29 = None + attn_output_27 = torch._C._nn.scaled_dot_product_attention( + query_layer_9, + key_layer_9, + value_layer_9, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_9 = key_layer_9 = value_layer_9 = None + attn_output_28 = attn_output_27.transpose(1, 2) + attn_output_27 = None + attn_output_29 = attn_output_28.reshape(1, 11, 768) + attn_output_28 = None + hidden_states_72 = torch._C._nn.linear( + attn_output_29, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_19 = ( + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p.item() + ) + 
l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_73 = torch.nn.functional.dropout( + hidden_states_72, item_19, False, False + ) + hidden_states_72 = item_19 = None + add_21 = hidden_states_73 + hidden_states_71 + hidden_states_73 = hidden_states_71 = None + hidden_states_74 = torch.nn.functional.layer_norm( + add_21, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_21 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_75 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_76 = torch._C._nn.gelu(hidden_states_75) + hidden_states_75 = None + hidden_states_77 = torch._C._nn.linear( + hidden_states_76, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = (None) + item_20 = ( + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p = ( + None + ) + hidden_states_78 = torch.nn.functional.dropout( + hidden_states_77, item_20, False, False + ) + hidden_states_77 = item_20 = None + add_22 = hidden_states_78 + hidden_states_74 + hidden_states_78 = hidden_states_74 = None + hidden_states_79 = torch.nn.functional.layer_norm( + add_22, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_22 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_60 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_30 = linear_60.view(1, -1, 12, 64) + linear_60 = None + query_layer_10 = view_30.transpose(1, 2) + view_30 = None + 
linear_61 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_31 = linear_61.view(1, -1, 12, 64) + linear_61 = None + key_layer_10 = view_31.transpose(1, 2) + view_31 = None + linear_62 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_32 = linear_62.view(1, -1, 12, 64) + linear_62 = None + value_layer_10 = view_32.transpose(1, 2) + view_32 = None + attn_output_30 = torch._C._nn.scaled_dot_product_attention( + query_layer_10, + key_layer_10, + value_layer_10, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_10 = key_layer_10 = value_layer_10 = None + attn_output_31 = attn_output_30.transpose(1, 2) + attn_output_30 = None + attn_output_32 = attn_output_31.reshape(1, 11, 768) + attn_output_31 = None + hidden_states_80 = torch._C._nn.linear( + attn_output_32, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_21 = ( + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_81 = torch.nn.functional.dropout( + hidden_states_80, item_21, False, False + ) + hidden_states_80 = item_21 = None + add_23 = hidden_states_81 + hidden_states_79 + hidden_states_81 = hidden_states_79 = None + hidden_states_82 = torch.nn.functional.layer_norm( + add_23, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_23 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_83 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_, + ) + 
l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_84 = torch._C._nn.gelu(hidden_states_83) + hidden_states_83 = None + hidden_states_85 = torch._C._nn.linear( + hidden_states_84, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = (None) + item_22 = ( + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p = ( + None + ) + hidden_states_86 = torch.nn.functional.dropout( + hidden_states_85, item_22, False, False + ) + hidden_states_85 = item_22 = None + add_24 = hidden_states_86 + hidden_states_82 + hidden_states_86 = hidden_states_82 = None + hidden_states_87 = torch.nn.functional.layer_norm( + add_24, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_24 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_66 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_33 = linear_66.view(1, -1, 12, 64) + linear_66 = None + query_layer_11 = view_33.transpose(1, 2) + view_33 = None + linear_67 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_34 = linear_67.view(1, -1, 12, 64) + linear_67 = None + key_layer_11 = view_34.transpose(1, 2) + view_34 = None + linear_68 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_35 = linear_68.view(1, 
-1, 12, 64) + linear_68 = None + value_layer_11 = view_35.transpose(1, 2) + view_35 = None + attn_output_33 = torch._C._nn.scaled_dot_product_attention( + query_layer_11, + key_layer_11, + value_layer_11, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_11 = key_layer_11 = value_layer_11 = extended_attention_mask = None + attn_output_34 = attn_output_33.transpose(1, 2) + attn_output_33 = None + attn_output_35 = attn_output_34.reshape(1, 11, 768) + attn_output_34 = None + hidden_states_88 = torch._C._nn.linear( + attn_output_35, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_23 = ( + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_89 = torch.nn.functional.dropout( + hidden_states_88, item_23, False, False + ) + hidden_states_88 = item_23 = None + add_25 = hidden_states_89 + hidden_states_87 + hidden_states_89 = hidden_states_87 = None + hidden_states_90 = torch.nn.functional.layer_norm( + add_25, + (768,), + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_25 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_91 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_92 = torch._C._nn.gelu(hidden_states_91) + hidden_states_91 = None + hidden_states_93 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = (None) + item_24 = ( + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p = ( + None + ) + hidden_states_94 = torch.nn.functional.dropout( + hidden_states_93, item_24, False, False + ) + hidden_states_93 = item_24 = None + add_26 = hidden_states_94 + 
+    hidden_states_94 = hidden_states_90 = None
+    hidden_states_95 = torch.nn.functional.layer_norm(
+        add_26,
+        (768,),
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_,
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_,
+        1e-05,
+    )
+    add_26 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = (None)
+    first_token_tensor = hidden_states_95[(slice(None, None, None), 0)]
+    pooled_output = torch._C._nn.linear(
+        first_token_tensor,
+        l_self_modules_pooler_modules_dense_parameters_weight_,
+        l_self_modules_pooler_modules_dense_parameters_bias_,
+    )
+    first_token_tensor = (
+        l_self_modules_pooler_modules_dense_parameters_weight_
+    ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None
+    pooled_output_1 = torch.tanh(pooled_output)
+    pooled_output = None
+    return (hidden_states_95, pooled_output_1)
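+
+# --- Editorial note (not part of the captured graph; line added by review) ---
+# The forward above is a Dynamo-flattened trace of a standard 12-head,
+# 768-dim RoBERTa-style encoder plus tanh pooler. Every encoder layer repeats
+# the same pattern, with each input nulled out after its last use so memory
+# can be released early. A minimal module-level sketch of one layer, using
+# illustrative names (h, Wq, mask, ...) that are assumptions, not part of the
+# trace:
+#
+#     q = F.linear(h, Wq, bq).view(1, -1, 12, 64).transpose(1, 2)
+#     k = F.linear(h, Wk, bk).view(1, -1, 12, 64).transpose(1, 2)
+#     v = F.linear(h, Wv, bv).view(1, -1, 12, 64).transpose(1, 2)
+#     a = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+#     a = a.transpose(1, 2).reshape(1, 11, 768)
+#     h = F.layer_norm(h + F.dropout(F.linear(a, Wo, bo), p), (768,), g1, b1, 1e-05)
+#     ff = F.linear(F.gelu(F.linear(h, Wi, bi)), Wf, bf)
+#     h = F.layer_norm(h + F.dropout(ff, p), (768,), g2, b2, 1e-05)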
dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p: + name = 
"L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
"cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 
-0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_"
"L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p"
"L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
"L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
"L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.100
+    std = 0.000
+    data = [0.100000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_"
"L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.100 + std = 0.000 + data = [0.100000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_hash.txt b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_hash.txt new file mode 100644 index 000000000..351216527 --- /dev/null +++ b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_hash.txt @@ -0,0 +1 @@ +ddf8330c24498ecc85536feb3baedb3ce918561f7e3efe053c569b8034d80228 \ No newline at end of file diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_net.json b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_net.json new file mode 100644 
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/graph_net.json
@@ -0,0 +1,6 @@
+{
+  "framework": "torch",
+  "num_devices_required": 1,
+  "num_nodes_required": 1,
+  "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_meta.py b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_tensor_constraints.py b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/model.py b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/model.py
new file mode 100644
index 000000000..c37698055
--- /dev/null
+++ b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/model.py
@@ -0,0 +1,1818 @@
+import torch
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_self_modules_embeddings_buffers_token_type_ids_: torch.Tensor,
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_dropout_p: torch.Tensor,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_self_modules_embeddings_buffers_token_type_ids_ = ( + L_self_modules_embeddings_buffers_token_type_ids_ + ) + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + 
L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_embeddings_modules_dropout_p = ( + L_self_modules_embeddings_modules_dropout_p + ) + l_attention_mask_ = L_attention_mask_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p + 
l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + 
+    l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+    l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+    l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+    l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+    l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p
+    l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+    l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+    l_self_modules_pooler_modules_dense_parameters_weight_ = L_self_modules_pooler_modules_dense_parameters_weight_
+    l_self_modules_pooler_modules_dense_parameters_bias_ = L_self_modules_pooler_modules_dense_parameters_bias_
+    buffered_token_type_ids = l_self_modules_embeddings_buffers_token_type_ids_[(slice(None, None, None), slice(None, 17, None))]
+    l_self_modules_embeddings_buffers_token_type_ids_ = None
+    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(1, 17)
+    buffered_token_type_ids = None
+    ne = l_input_ids_.ne(1)
+    mask = ne.int()
+    ne = None
+    cumsum = torch.cumsum(mask, dim=1)
+    type_as = cumsum.type_as(mask)
+    cumsum = None
+    add = type_as + 0
+    type_as = None
+    incremental_indices = add * mask
+    add = mask = None
+    long = incremental_indices.long()
+    incremental_indices = None
+    position_ids = long + 1
+    long = None
+    inputs_embeds = torch.nn.functional.embedding(l_input_ids_, l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, 1, None, 2.0, False, False)
+    l_input_ids_ = l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = None
+    token_type_embeddings = torch.nn.functional.embedding(buffered_token_type_ids_expanded, l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, None, None, 2.0, False, False)
+    buffered_token_type_ids_expanded = l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = None
+    embeddings = inputs_embeds + token_type_embeddings
+    inputs_embeds = token_type_embeddings = None
+    position_embeddings = torch.nn.functional.embedding(position_ids, l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, 1, None, 2.0, False, False)
+    position_ids = l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = None
+    embeddings += position_embeddings
+    embeddings_1 = embeddings
+    embeddings = position_embeddings = None
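+    # Embedding epilogue: LayerNorm + dropout over the summed embeddings, then the
+    # 0/1 attention mask is inverted and scaled into an additive float bias for SDPA.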
+    embeddings_2 = torch.nn.functional.layer_norm(embeddings_1, (768,), l_self_modules_embeddings_modules_layer_norm_parameters_weight_, l_self_modules_embeddings_modules_layer_norm_parameters_bias_, 1e-05)
+    embeddings_1 = l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+    item = l_self_modules_embeddings_modules_dropout_p.item()
+    l_self_modules_embeddings_modules_dropout_p = None
+    embeddings_3 = torch.nn.functional.dropout(embeddings_2, item, False, False)
+    embeddings_2 = item = None
+    getitem_1 = l_attention_mask_[(slice(None, None, None), None, None, slice(None, None, None))]
+    l_attention_mask_ = None
+    expand_1 = getitem_1.expand(1, 1, 17, 17)
+    getitem_1 = None
+    expanded_mask = expand_1.to(torch.float32)
+    expand_1 = None
+    tensor = torch.tensor(1.0, dtype=torch.float32)
+    inverted_mask = tensor - expanded_mask
+    tensor = expanded_mask = None
+    to_1 = inverted_mask.to(torch.bool)
+    extended_attention_mask = inverted_mask.masked_fill(to_1, -3.4028234663852886e38)
+    inverted_mask = to_1 = None
+    linear = torch._C._nn.linear(embeddings_3, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view = linear.view(1, -1, 12, 64)
+    linear = None
+    query_layer = view.transpose(1, 2)
+    view = None
+    linear_1 = torch._C._nn.linear(embeddings_3, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_1 = linear_1.view(1, -1, 12, 64)
+    linear_1 = None
+    key_layer = view_1.transpose(1, 2)
+    view_1 = None
+    linear_2 = torch._C._nn.linear(embeddings_3, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_2 = linear_2.view(1, -1, 12, 64)
+    linear_2 = None
+    value_layer = view_2.transpose(1, 2)
+    view_2 = None
+    attn_output = torch._C._nn.scaled_dot_product_attention(query_layer, key_layer, value_layer, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer = key_layer = value_layer = None
+    attn_output_1 = attn_output.transpose(1, 2)
+    attn_output = None
+    attn_output_2 = attn_output_1.reshape(1, 17, 768)
+    attn_output_1 = None
+    hidden_states = torch._C._nn.linear(attn_output_2, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_)
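+    # Layer 0 (cont.): attention-output dropout, residual add + LayerNorm,
+    # then the GELU feed-forward block with its own dropout/residual/LayerNorm.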
+    attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_1 = torch.nn.functional.dropout(hidden_states, item_1, False, False)
+    hidden_states = item_1 = None
+    add_3 = hidden_states_1 + embeddings_3
+    hidden_states_1 = embeddings_3 = None
+    hidden_states_2 = torch.nn.functional.layer_norm(add_3, (768,), l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_3 = torch._C._nn.linear(hidden_states_2, l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+    hidden_states_3 = None
+    hidden_states_5 = torch._C._nn.linear(hidden_states_4, l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_)
+    hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = None
+    item_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p = None
+    hidden_states_6 = torch.nn.functional.dropout(hidden_states_5, item_2, False, False)
+    hidden_states_5 = item_2 = None
+    add_4 = hidden_states_6 + hidden_states_2
+    hidden_states_6 = hidden_states_2 = None
+    hidden_states_7 = torch.nn.functional.layer_norm(add_4, (768,), l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_6 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_)
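+    # Layer 1: identical block structure — per-head Q/K/V projections
+    # (12 heads x 64 dims), SDPA with the shared additive mask.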
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_3 = linear_6.view(1, -1, 12, 64)
+    linear_6 = None
+    query_layer_1 = view_3.transpose(1, 2)
+    view_3 = None
+    linear_7 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_4 = linear_7.view(1, -1, 12, 64)
+    linear_7 = None
+    key_layer_1 = view_4.transpose(1, 2)
+    view_4 = None
+    linear_8 = torch._C._nn.linear(hidden_states_7, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_5 = linear_8.view(1, -1, 12, 64)
+    linear_8 = None
+    value_layer_1 = view_5.transpose(1, 2)
+    view_5 = None
+    attn_output_3 = torch._C._nn.scaled_dot_product_attention(query_layer_1, key_layer_1, value_layer_1, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_1 = key_layer_1 = value_layer_1 = None
+    attn_output_4 = attn_output_3.transpose(1, 2)
+    attn_output_3 = None
+    attn_output_5 = attn_output_4.reshape(1, 17, 768)
+    attn_output_4 = None
+    hidden_states_8 = torch._C._nn.linear(attn_output_5, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_9 = torch.nn.functional.dropout(hidden_states_8, item_3, False, False)
+    hidden_states_8 = item_3 = None
+    add_5 = hidden_states_9 + hidden_states_7
+    hidden_states_9 = hidden_states_7 = None
+    hidden_states_10 = torch.nn.functional.layer_norm(add_5, (768,), l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_11 = torch._C._nn.linear(hidden_states_10, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+    hidden_states_11 = None
+    hidden_states_13 = torch._C._nn.linear(hidden_states_12, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_)
+    hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = None
+    item_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p = None
+    hidden_states_14 = torch.nn.functional.dropout(hidden_states_13, item_4, False, False)
+    hidden_states_13 = item_4 = None
+    add_6 = hidden_states_14 + hidden_states_10
+    hidden_states_14 = hidden_states_10 = None
+    hidden_states_15 = torch.nn.functional.layer_norm(add_6, (768,), l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_12 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_6 = linear_12.view(1, -1, 12, 64)
+    linear_12 = None
+    query_layer_2 = view_6.transpose(1, 2)
+    view_6 = None
+    linear_13 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_7 = linear_13.view(1, -1, 12, 64)
+    linear_13 = None
+    key_layer_2 = view_7.transpose(1, 2)
+    view_7 = None
+    linear_14 = torch._C._nn.linear(hidden_states_15, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_8 = linear_14.view(1, -1, 12, 64)
+    linear_14 = None
+    value_layer_2 = view_8.transpose(1, 2)
+    view_8 = None
+    attn_output_6 = torch._C._nn.scaled_dot_product_attention(query_layer_2, key_layer_2, value_layer_2, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_2 = key_layer_2 = value_layer_2 = None
+    attn_output_7 = attn_output_6.transpose(1, 2)
+    attn_output_6 = None
+    attn_output_8 = attn_output_7.reshape(1, 17, 768)
+    attn_output_7 = None
+    hidden_states_16 = torch._C._nn.linear(attn_output_8, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_17 = torch.nn.functional.dropout(hidden_states_16, item_5, False, False)
+    hidden_states_16 = item_5 = None
+    add_7 = hidden_states_17 + hidden_states_15
+    hidden_states_17 = hidden_states_15 = None
+    hidden_states_18 = torch.nn.functional.layer_norm(add_7, (768,), l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_7 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_19 = torch._C._nn.linear(hidden_states_18, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+    hidden_states_19 = None
+    hidden_states_21 = torch._C._nn.linear(hidden_states_20, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_)
+    hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = None
+    item_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p.item()
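+    # Layer 2 output epilogue, then layer 3 self-attention.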
+    l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p = None
+    hidden_states_22 = torch.nn.functional.dropout(hidden_states_21, item_6, False, False)
+    hidden_states_21 = item_6 = None
+    add_8 = hidden_states_22 + hidden_states_18
+    hidden_states_22 = hidden_states_18 = None
+    hidden_states_23 = torch.nn.functional.layer_norm(add_8, (768,), l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_18 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_9 = linear_18.view(1, -1, 12, 64)
+    linear_18 = None
+    query_layer_3 = view_9.transpose(1, 2)
+    view_9 = None
+    linear_19 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_10 = linear_19.view(1, -1, 12, 64)
+    linear_19 = None
+    key_layer_3 = view_10.transpose(1, 2)
+    view_10 = None
+    linear_20 = torch._C._nn.linear(hidden_states_23, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_11 = linear_20.view(1, -1, 12, 64)
+    linear_20 = None
+    value_layer_3 = view_11.transpose(1, 2)
+    view_11 = None
+    attn_output_9 = torch._C._nn.scaled_dot_product_attention(query_layer_3, key_layer_3, value_layer_3, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_3 = key_layer_3 = value_layer_3 = None
+    attn_output_10 = attn_output_9.transpose(1, 2)
+    attn_output_9 = None
+    attn_output_11 = attn_output_10.reshape(1, 17, 768)
+    attn_output_10 = None
+    hidden_states_24 = torch._C._nn.linear(attn_output_11, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_)
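+    # Layer 3 (cont.): attention epilogue and feed-forward; layer 4 then
+    # begins with its query projection.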
+    attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_7 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_25 = torch.nn.functional.dropout(hidden_states_24, item_7, False, False)
+    hidden_states_24 = item_7 = None
+    add_9 = hidden_states_25 + hidden_states_23
+    hidden_states_25 = hidden_states_23 = None
+    hidden_states_26 = torch.nn.functional.layer_norm(add_9, (768,), l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_9 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_27 = torch._C._nn.linear(hidden_states_26, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+    hidden_states_27 = None
+    hidden_states_29 = torch._C._nn.linear(hidden_states_28, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_)
+    hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = None
+    item_8 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p = None
+    hidden_states_30 = torch.nn.functional.dropout(hidden_states_29, item_8, False, False)
+    hidden_states_29 = item_8 = None
+    add_10 = hidden_states_30 + hidden_states_26
+    hidden_states_30 = hidden_states_26 = None
+    hidden_states_31 = torch.nn.functional.layer_norm(add_10, (768,), l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_10 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_24 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_12 = linear_24.view(1, -1, 12, 64)
+    linear_24 = None
+    query_layer_4 = view_12.transpose(1, 2)
+    view_12 = None
+    linear_25 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_13 = linear_25.view(1, -1, 12, 64)
+    linear_25 = None
+    key_layer_4 = view_13.transpose(1, 2)
+    view_13 = None
+    linear_26 = torch._C._nn.linear(hidden_states_31, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_14 = linear_26.view(1, -1, 12, 64)
+    linear_26 = None
+    value_layer_4 = view_14.transpose(1, 2)
+    view_14 = None
+    attn_output_12 = torch._C._nn.scaled_dot_product_attention(query_layer_4, key_layer_4, value_layer_4, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_4 = key_layer_4 = value_layer_4 = None
+    attn_output_13 = attn_output_12.transpose(1, 2)
+    attn_output_12 = None
+    attn_output_14 = attn_output_13.reshape(1, 17, 768)
+    attn_output_13 = None
+    hidden_states_32 = torch._C._nn.linear(attn_output_14, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_9 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_33 = torch.nn.functional.dropout(hidden_states_32, item_9, False, False)
+    hidden_states_32 = item_9 = None
+    add_11 = hidden_states_33 + hidden_states_31
+    hidden_states_33 = hidden_states_31 = None
+    hidden_states_34 = torch.nn.functional.layer_norm(add_11, (768,), l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_11 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_35 = torch._C._nn.linear(hidden_states_34, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+    hidden_states_35 = None
+    hidden_states_37 = torch._C._nn.linear(hidden_states_36, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_)
+    hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+    item_10 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p = None
+    hidden_states_38 = torch.nn.functional.dropout(hidden_states_37, item_10, False, False)
+    hidden_states_37 = item_10 = None
+    add_12 = hidden_states_38 + hidden_states_34
+    hidden_states_38 = hidden_states_34 = None
+    hidden_states_39 = torch.nn.functional.layer_norm(add_12, (768,), l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_12 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_30 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_15 = linear_30.view(1, -1, 12, 64)
+    linear_30 = None
+    query_layer_5 = view_15.transpose(1, 2)
+    view_15 = None
+    linear_31 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_16 = linear_31.view(1, -1, 12, 64)
+    linear_31 = None
+    key_layer_5 = view_16.transpose(1, 2)
+    view_16 = None
+    linear_32 = torch._C._nn.linear(hidden_states_39, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_17 = linear_32.view(1, -1, 12, 64)
+    linear_32 = None
+    value_layer_5 = view_17.transpose(1, 2)
+    view_17 = None
+    attn_output_15 = torch._C._nn.scaled_dot_product_attention(query_layer_5, key_layer_5, value_layer_5, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_5 = key_layer_5 = value_layer_5 = None
+    attn_output_16 = attn_output_15.transpose(1, 2)
+    attn_output_15 = None
+    attn_output_17 = attn_output_16.reshape(1, 17, 768)
+    attn_output_16 = None
+    hidden_states_40 = torch._C._nn.linear(attn_output_17, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_11 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_41 = torch.nn.functional.dropout(hidden_states_40, item_11, False, False)
+    hidden_states_40 = item_11 = None
+    add_13 = hidden_states_41 + hidden_states_39
+    hidden_states_41 = hidden_states_39 = None
+    hidden_states_42 = torch.nn.functional.layer_norm(add_13, (768,), l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_13 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_43 = torch._C._nn.linear(hidden_states_42, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+    hidden_states_43 = None
+    hidden_states_45 = torch._C._nn.linear(hidden_states_44, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_)
+    hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+    item_12 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p.item()
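+    # Layer 5 output epilogue, then layer 6 self-attention.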
+    l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p = None
+    hidden_states_46 = torch.nn.functional.dropout(hidden_states_45, item_12, False, False)
+    hidden_states_45 = item_12 = None
+    add_14 = hidden_states_46 + hidden_states_42
+    hidden_states_46 = hidden_states_42 = None
+    hidden_states_47 = torch.nn.functional.layer_norm(add_14, (768,), l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_14 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_36 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_18 = linear_36.view(1, -1, 12, 64)
+    linear_36 = None
+    query_layer_6 = view_18.transpose(1, 2)
+    view_18 = None
+    linear_37 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_19 = linear_37.view(1, -1, 12, 64)
+    linear_37 = None
+    key_layer_6 = view_19.transpose(1, 2)
+    view_19 = None
+    linear_38 = torch._C._nn.linear(hidden_states_47, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_20 = linear_38.view(1, -1, 12, 64)
+    linear_38 = None
+    value_layer_6 = view_20.transpose(1, 2)
+    view_20 = None
+    attn_output_18 = torch._C._nn.scaled_dot_product_attention(query_layer_6, key_layer_6, value_layer_6, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_6 = key_layer_6 = value_layer_6 = None
+    attn_output_19 = attn_output_18.transpose(1, 2)
+    attn_output_18 = None
+    attn_output_20 = attn_output_19.reshape(1, 17, 768)
+    attn_output_19 = None
+    hidden_states_48 = torch._C._nn.linear(attn_output_20, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_)
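+    # Layer 6 (cont.): attention epilogue and feed-forward; layer 7 then
+    # begins with its query projection.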
+    attn_output_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_13 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_49 = torch.nn.functional.dropout(hidden_states_48, item_13, False, False)
+    hidden_states_48 = item_13 = None
+    add_15 = hidden_states_49 + hidden_states_47
+    hidden_states_49 = hidden_states_47 = None
+    hidden_states_50 = torch.nn.functional.layer_norm(add_15, (768,), l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_15 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_51 = torch._C._nn.linear(hidden_states_50, l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+    hidden_states_51 = None
+    hidden_states_53 = torch._C._nn.linear(hidden_states_52, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_)
+    hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = None
+    item_14 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p = None
+    hidden_states_54 = torch.nn.functional.dropout(hidden_states_53, item_14, False, False)
+    hidden_states_53 = item_14 = None
+    add_16 = hidden_states_54 + hidden_states_50
+    hidden_states_54 = hidden_states_50 = None
+    hidden_states_55 = torch.nn.functional.layer_norm(add_16, (768,), l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_16 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_42 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_21 = linear_42.view(1, -1, 12, 64)
+    linear_42 = None
+    query_layer_7 = view_21.transpose(1, 2)
+    view_21 = None
+    linear_43 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_22 = linear_43.view(1, -1, 12, 64)
+    linear_43 = None
+    key_layer_7 = view_22.transpose(1, 2)
+    view_22 = None
+    linear_44 = torch._C._nn.linear(hidden_states_55, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_23 = linear_44.view(1, -1, 12, 64)
+    linear_44 = None
+    value_layer_7 = view_23.transpose(1, 2)
+    view_23 = None
+    attn_output_21 = torch._C._nn.scaled_dot_product_attention(query_layer_7, key_layer_7, value_layer_7, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_7 = key_layer_7 = value_layer_7 = None
+    attn_output_22 = attn_output_21.transpose(1, 2)
+    attn_output_21 = None
+    attn_output_23 = attn_output_22.reshape(1, 17, 768)
+    attn_output_22 = None
+    hidden_states_56 = torch._C._nn.linear(attn_output_23, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_15 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_57 = torch.nn.functional.dropout(hidden_states_56, item_15, False, False)
+    hidden_states_56 = item_15 = None
+    add_17 = hidden_states_57 + hidden_states_55
+    hidden_states_57 = hidden_states_55 = None
+    hidden_states_58 = torch.nn.functional.layer_norm(add_17, (768,), l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_17 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_59 = torch._C._nn.linear(hidden_states_58, l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+    hidden_states_59 = None
+    hidden_states_61 = torch._C._nn.linear(hidden_states_60, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_)
+    hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = None
+    item_16 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p = None
+    hidden_states_62 = torch.nn.functional.dropout(hidden_states_61, item_16, False, False)
+    hidden_states_61 = item_16 = None
+    add_18 = hidden_states_62 + hidden_states_58
+    hidden_states_62 = hidden_states_58 = None
+    hidden_states_63 = torch.nn.functional.layer_norm(add_18, (768,), l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_18 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_48 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_24 = linear_48.view(1, -1, 12, 64)
+    linear_48 = None
+    query_layer_8 = view_24.transpose(1, 2)
+    view_24 = None
+    linear_49 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_25 = linear_49.view(1, -1, 12, 64)
+    linear_49 = None
+    key_layer_8 = view_25.transpose(1, 2)
+    view_25 = None
+    linear_50 = torch._C._nn.linear(hidden_states_63, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_26 = linear_50.view(1, -1, 12, 64)
+    linear_50 = None
+    value_layer_8 = view_26.transpose(1, 2)
+    view_26 = None
+    attn_output_24 = torch._C._nn.scaled_dot_product_attention(query_layer_8, key_layer_8, value_layer_8, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_8 = key_layer_8 = value_layer_8 = None
+    attn_output_25 = attn_output_24.transpose(1, 2)
+    attn_output_24 = None
+    attn_output_26 = attn_output_25.reshape(1, 17, 768)
+    attn_output_25 = None
+    hidden_states_64 = torch._C._nn.linear(attn_output_26, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_)
+    attn_output_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+    item_17 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p.item()
+    l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p = None
+    hidden_states_65 = torch.nn.functional.dropout(hidden_states_64, item_17, False, False)
+    hidden_states_64 = item_17 = None
+    add_19 = hidden_states_65 + hidden_states_63
+    hidden_states_65 = hidden_states_63 = None
+    hidden_states_66 = torch.nn.functional.layer_norm(add_19, (768,), l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_19 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+    hidden_states_67 = torch._C._nn.linear(hidden_states_66, l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = None
+    hidden_states_68 = torch._C._nn.gelu(hidden_states_67)
+    hidden_states_67 = None
+    hidden_states_69 = torch._C._nn.linear(hidden_states_68, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_)
+    hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = None
+    item_18 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p.item()
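+    # Layer 8 output epilogue, then layer 9 self-attention.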
+    l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p = None
+    hidden_states_70 = torch.nn.functional.dropout(hidden_states_69, item_18, False, False)
+    hidden_states_69 = item_18 = None
+    add_20 = hidden_states_70 + hidden_states_66
+    hidden_states_70 = hidden_states_66 = None
+    hidden_states_71 = torch.nn.functional.layer_norm(add_20, (768,), l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_, l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_, 1e-05)
+    add_20 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = None
+    linear_54 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = None
+    view_27 = linear_54.view(1, -1, 12, 64)
+    linear_54 = None
+    query_layer_9 = view_27.transpose(1, 2)
+    view_27 = None
+    linear_55 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = None
+    view_28 = linear_55.view(1, -1, 12, 64)
+    linear_55 = None
+    key_layer_9 = view_28.transpose(1, 2)
+    view_28 = None
+    linear_56 = torch._C._nn.linear(hidden_states_71, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_)
+    l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = None
+    view_29 = linear_56.view(1, -1, 12, 64)
+    linear_56 = None
+    value_layer_9 = view_29.transpose(1, 2)
+    view_29 = None
+    attn_output_27 = torch._C._nn.scaled_dot_product_attention(query_layer_9, key_layer_9, value_layer_9, attn_mask=extended_attention_mask, dropout_p=0.0, is_causal=False)
+    query_layer_9 = key_layer_9 = value_layer_9 = None
+    attn_output_28 = attn_output_27.transpose(1, 2)
+    attn_output_27 = None
+    attn_output_29 = attn_output_28.reshape(1, 17, 768)
+    attn_output_28 = None
+    hidden_states_72 = torch._C._nn.linear(attn_output_29, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_)
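+    # Layer 9 (cont.): attention epilogue and feed-forward; layer 10 then
+    # begins with its query projection.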
l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_19 = ( + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_73 = torch.nn.functional.dropout( + hidden_states_72, item_19, False, False + ) + hidden_states_72 = item_19 = None + add_21 = hidden_states_73 + hidden_states_71 + hidden_states_73 = hidden_states_71 = None + hidden_states_74 = torch.nn.functional.layer_norm( + add_21, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_21 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_75 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_76 = torch._C._nn.gelu(hidden_states_75) + hidden_states_75 = None + hidden_states_77 = torch._C._nn.linear( + hidden_states_76, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = (None) + item_20 = ( + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p = ( + None + ) + hidden_states_78 = torch.nn.functional.dropout( + hidden_states_77, item_20, False, False + ) + hidden_states_77 = item_20 = None + add_22 = hidden_states_78 + hidden_states_74 + hidden_states_78 = hidden_states_74 = None + hidden_states_79 = torch.nn.functional.layer_norm( + add_22, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_22 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_60 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_30 = linear_60.view(1, -1, 12, 64) + linear_60 = None + query_layer_10 = view_30.transpose(1, 2) + view_30 = None + linear_61 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_31 = linear_61.view(1, -1, 12, 64) + linear_61 = None + key_layer_10 = view_31.transpose(1, 2) + view_31 = None + linear_62 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_32 = linear_62.view(1, -1, 12, 64) + linear_62 = None + value_layer_10 = view_32.transpose(1, 2) + view_32 = None + attn_output_30 = torch._C._nn.scaled_dot_product_attention( + query_layer_10, + key_layer_10, + value_layer_10, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_10 = key_layer_10 = value_layer_10 = None + attn_output_31 = attn_output_30.transpose(1, 2) + attn_output_30 = None + attn_output_32 = attn_output_31.reshape(1, 17, 768) + attn_output_31 = None + hidden_states_80 = torch._C._nn.linear( + attn_output_32, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_21 = ( + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_81 = torch.nn.functional.dropout( + hidden_states_80, item_21, False, False + ) + hidden_states_80 = item_21 = None + add_23 = hidden_states_81 + hidden_states_79 + hidden_states_81 = hidden_states_79 = None + hidden_states_82 = torch.nn.functional.layer_norm( + add_23, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_23 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_83 = 
torch._C._nn.linear( + hidden_states_82, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_84 = torch._C._nn.gelu(hidden_states_83) + hidden_states_83 = None + hidden_states_85 = torch._C._nn.linear( + hidden_states_84, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = (None) + item_22 = ( + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p = ( + None + ) + hidden_states_86 = torch.nn.functional.dropout( + hidden_states_85, item_22, False, False + ) + hidden_states_85 = item_22 = None + add_24 = hidden_states_86 + hidden_states_82 + hidden_states_86 = hidden_states_82 = None + hidden_states_87 = torch.nn.functional.layer_norm( + add_24, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_24 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_66 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_33 = linear_66.view(1, -1, 12, 64) + linear_66 = None + query_layer_11 = view_33.transpose(1, 2) + view_33 = None + linear_67 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_34 = linear_67.view(1, -1, 12, 64) + linear_67 = None + key_layer_11 = view_34.transpose(1, 2) + view_34 = None + linear_68 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_, + ) + 
l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_35 = linear_68.view(1, -1, 12, 64) + linear_68 = None + value_layer_11 = view_35.transpose(1, 2) + view_35 = None + attn_output_33 = torch._C._nn.scaled_dot_product_attention( + query_layer_11, + key_layer_11, + value_layer_11, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_11 = key_layer_11 = value_layer_11 = extended_attention_mask = None + attn_output_34 = attn_output_33.transpose(1, 2) + attn_output_33 = None + attn_output_35 = attn_output_34.reshape(1, 17, 768) + attn_output_34 = None + hidden_states_88 = torch._C._nn.linear( + attn_output_35, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + item_23 = ( + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p.item() + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p = ( + None + ) + hidden_states_89 = torch.nn.functional.dropout( + hidden_states_88, item_23, False, False + ) + hidden_states_88 = item_23 = None + add_25 = hidden_states_89 + hidden_states_87 + hidden_states_89 = hidden_states_87 = None + hidden_states_90 = torch.nn.functional.layer_norm( + add_25, + (768,), + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_25 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_91 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_92 = torch._C._nn.gelu(hidden_states_91) + hidden_states_91 = None + hidden_states_93 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = (None) + item_24 = ( + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p.item() + ) + 
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p = (
+            None
+        )
+        hidden_states_94 = torch.nn.functional.dropout(
+            hidden_states_93, item_24, False, False
+        )
+        hidden_states_93 = item_24 = None
+        add_26 = hidden_states_94 + hidden_states_90
+        hidden_states_94 = hidden_states_90 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            add_26,
+            (768,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_,
+            1e-05,
+        )
+        add_26 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        first_token_tensor = hidden_states_95[(slice(None, None, None), 0)]
+        pooled_output = torch._C._nn.linear(
+            first_token_tensor,
+            l_self_modules_pooler_modules_dense_parameters_weight_,
+            l_self_modules_pooler_modules_dense_parameters_bias_,
+        )
+        first_token_tensor = (
+            l_self_modules_pooler_modules_dense_parameters_weight_
+        ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None
+        pooled_output_1 = torch.tanh(pooled_output)
+        pooled_output = None
+        return (hidden_states_95, pooled_output_1)
diff --git a/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/weight_meta.py b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/weight_meta.py
new file mode 100644
index 000000000..f2fbfe0e2
--- /dev/null
+++ b/samples/transformers-auto-model/gonchisi_roberta-base-bne-finetuned-new_or_used-title/weight_meta.py
@@ -0,0 +1,2287 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 17]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [
+        0,
+        16772,
+        4198,
+        24942,
+        4100,
+        747,
+        423,
+        449,
+        16596,
+        138,
+        40050,
+        9966,
+        882,
+        4198,
+        137,
+        68,
+        2,
+    ]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_token_type_ids_:
+    name = "L_self_modules_embeddings_buffers_token_type_ids_"
+    shape = [1, 514]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 0
+    max_val = 0
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [50262, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [1, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [514, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_dropout_p:
+    name = "L_self_modules_embeddings_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 17]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dropout_p"
+    shape = []
+    dtype = "torch.float64"
+    device = "cpu"
+    mean = 0.000
+    std = 0.000
+    data = [0.000000]
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" 
+ mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name 
= "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p: + name = 
"L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dropout_p" + shape = [] + dtype = "torch.float64" + device = "cpu" + mean = 0.000 + std = 0.000 + data = [0.000000] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_hash.txt b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_hash.txt new file mode 100644 index 000000000..ea94e11f1 --- /dev/null +++ 
b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_hash.txt @@ -0,0 +1 @@ +048840c4bc9a42796e0efbe9055eac662dce4552dafd35edb82863e019056d61 \ No newline at end of file diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_net.json b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_meta.py b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_tensor_constraints.py b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/model.py b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/model.py new file mode 100644 index 000000000..9cf73c74b --- /dev/null +++ b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/model.py @@ -0,0 +1,789 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_embedding_transformation_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_embedding_transformation_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_attention_mask_ = L_attention_mask_ + l_token_type_ids_ = L_token_type_ids_ + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_embedding_transformation_parameters_weight_ = L_self_modules_embeddings_modules_embedding_transformation_parameters_weight_ + l_self_modules_embeddings_modules_embedding_transformation_parameters_bias_ = ( + L_self_modules_embeddings_modules_embedding_transformation_parameters_bias_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + 
l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + 
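+        # Editorial note: each encoder layer contributes the same parameter
+        # groups (bottleneck input/attention projections, Q/K/V and attention
+        # output, three stacked FFNs ffn_modules_0..2, a final
+        # intermediate/output pair, and an output bottleneck), a layout that
+        # appears consistent with a MobileBERT-style bottleneck transformer.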
l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ + 
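+        # Editorial note: as in layer 0, the query/key projections below read
+        # from the 128-dim bottleneck activations while the value projection
+        # reads from the full 512-dim hidden state (compare the [128, 128]
+        # query/key weights with the [128, 512] value weight in weight_meta.py),
+        # which resembles a key/query-shared bottleneck.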
l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_ + l_self_modules_pooler_modules_dense_parameters_weight_ = ( + L_self_modules_pooler_modules_dense_parameters_weight_ + ) + l_self_modules_pooler_modules_dense_parameters_bias_ = ( + L_self_modules_pooler_modules_dense_parameters_bias_ + ) + extended_attention_mask = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + extended_attention_mask_1 = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = None + sub = 1.0 - extended_attention_mask_1 + extended_attention_mask_1 = None + extended_attention_mask_2 = sub * -3.4028234663852886e38 + sub = None + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(None, 20, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, + 0, + None, + 2.0, + False, + False, + ) + l_input_ids_ = ( + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) = None + getitem_2 = inputs_embeds[(slice(None, None, None), slice(1, None, None))] + pad = torch._C._nn.pad(getitem_2, [0, 0, 0, 1, 0, 0], "constant", 0.0) + getitem_2 = None + getitem_3 = inputs_embeds[(slice(None, None, None), slice(None, -1, None))] + pad_1 = torch._C._nn.pad(getitem_3, [0, 0, 1, 0, 0, 0], "constant", 0.0) + getitem_3 = None + inputs_embeds_1 = torch.cat([pad, inputs_embeds, pad_1], dim=2) + pad = inputs_embeds = pad_1 = None + inputs_embeds_2 = torch._C._nn.linear( + inputs_embeds_1, + l_self_modules_embeddings_modules_embedding_transformation_parameters_weight_, + l_self_modules_embeddings_modules_embedding_transformation_parameters_bias_, + ) + inputs_embeds_1 = l_self_modules_embeddings_modules_embedding_transformation_parameters_weight_ = ( + l_self_modules_embeddings_modules_embedding_transformation_parameters_bias_ + ) = None + position_embeddings = torch.nn.functional.embedding( + position_ids, + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = ( + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) = None + token_type_embeddings = torch.nn.functional.embedding( + l_token_type_ids_, + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + l_token_type_ids_ = ( + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) = None + add = inputs_embeds_2 + position_embeddings + inputs_embeds_2 = position_embeddings = None + embeddings = add + token_type_embeddings + add = token_type_embeddings = None + mul_1 = ( + embeddings * l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) + embeddings = ( + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) = None + embeddings_1 = ( + mul_1 + 
l_self_modules_embeddings_modules_layer_norm_parameters_bias_ + ) + mul_1 = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None + embeddings_2 = torch.nn.functional.dropout(embeddings_1, 0.0, False, False) + embeddings_1 = None + layer_input = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_ = (None) + mul_2 = ( + layer_input + * l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ + ) + layer_input = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ = (None) + layer_input_1 = ( + mul_2 + + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ + ) + mul_2 = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ = (None) + layer_input_2 = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ = (None) + mul_3 = ( + layer_input_2 + * l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ + ) + layer_input_2 = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ = (None) + layer_input_3 = ( + mul_3 + + l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ + ) + mul_3 = l_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ = (None) + linear_3 = torch._C._nn.linear( + layer_input_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view = linear_3.view(1, -1, 4, 32) + linear_3 = None + query_layer = view.transpose(1, 2) + view = None + linear_4 = torch._C._nn.linear( + layer_input_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + layer_input_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_4.view(1, -1, 4, 32) + linear_4 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_5 = torch._C._nn.linear( + embeddings_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_5.view(1, -1, 4, 32) + linear_5 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + transpose_3 = key_layer.transpose(-1, -2) + key_layer = None + attention_scores = torch.matmul(query_layer, transpose_3) + query_layer = transpose_3 = None + attention_scores_1 = attention_scores / 5.656854249492381 + attention_scores = None + attention_scores_2 = attention_scores_1 + extended_attention_mask_2 + attention_scores_1 = None + attention_probs = torch.nn.functional.softmax(attention_scores_2, dim=-1) + attention_scores_2 = None + attention_probs_1 = torch.nn.functional.dropout( + attention_probs, 0.1, False, False + ) + attention_probs = None + context_layer = torch.matmul(attention_probs_1, value_layer) + attention_probs_1 = value_layer = None + permute = context_layer.permute(0, 2, 1, 3) + context_layer = None + context_layer_1 = permute.contiguous() + permute = None + context_layer_2 = context_layer_1.view((1, 20, 128)) + context_layer_1 = None + layer_outputs = torch._C._nn.linear( + context_layer_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + add_6 = layer_outputs + layer_input_1 + layer_outputs = layer_input_1 = None + mul_4 = ( + add_6 + * l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ + ) + add_6 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_1 = ( + mul_4 + + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_4 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states = torch._C._nn.linear( + layer_outputs_1, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.relu(hidden_states, 
inplace=False) + hidden_states = None + layer_outputs_2 = torch._C._nn.linear( + hidden_states_1, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_1 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + add_8 = layer_outputs_2 + layer_outputs_1 + layer_outputs_2 = layer_outputs_1 = None + mul_5 = ( + add_8 + * l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ + ) + add_8 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_3 = ( + mul_5 + + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_5 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_2 = torch._C._nn.linear( + layer_outputs_3, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_3 = torch.nn.functional.relu(hidden_states_2, inplace=False) + hidden_states_2 = None + layer_outputs_4 = torch._C._nn.linear( + hidden_states_3, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_3 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + add_10 = layer_outputs_4 + layer_outputs_3 + layer_outputs_4 = layer_outputs_3 = None + mul_6 = ( + add_10 + * l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ + ) + add_10 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_5 = ( + mul_6 + + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_6 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.linear( + layer_outputs_5, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + 
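+        # Editorial note: the recurring mul_N / add pairs apply an elementwise
+        # scale and shift, NoNorm(x) = weight * x + bias with [128]-shaped
+        # parameters, in place of a full LayerNorm; each stacked FFN block
+        # first adds a residual connection (add_8, add_10, ...) and then this
+        # normalization.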
l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_5 = torch.nn.functional.relu(hidden_states_4, inplace=False) + hidden_states_4 = None + layer_outputs_6 = torch._C._nn.linear( + hidden_states_5, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_5 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + add_12 = layer_outputs_6 + layer_outputs_5 + layer_outputs_6 = layer_outputs_5 = None + mul_7 = ( + add_12 + * l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ + ) + add_12 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_7 = ( + mul_7 + + l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_7 = l_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_6 = torch._C._nn.linear( + layer_outputs_7, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_7 = torch.nn.functional.relu(hidden_states_6, inplace=False) + hidden_states_6 = None + layer_output = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_7 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + add_14 = layer_output + layer_outputs_7 + layer_output = layer_outputs_7 = None + mul_8 = ( + add_14 + * l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ + ) + add_14 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_output_1 = ( + mul_8 + + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_8 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + layer_outputs_8 = torch._C._nn.linear( + layer_output_1, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_, + ) + layer_output_1 
= l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_ = (None) + layer_outputs_9 = torch.nn.functional.dropout( + layer_outputs_8, 0.0, False, False + ) + layer_outputs_8 = None + add_16 = layer_outputs_9 + embeddings_2 + layer_outputs_9 = embeddings_2 = None + mul_9 = ( + add_16 + * l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ + ) + add_16 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_10 = ( + mul_9 + + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ + ) + mul_9 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ = (None) + tensor = torch.tensor(1000) + tensor = None + layer_input_4 = torch._C._nn.linear( + layer_outputs_10, + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_ = (None) + mul_10 = ( + layer_input_4 + * l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ + ) + layer_input_4 = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_weight_ = (None) + layer_input_5 = ( + mul_10 + + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ + ) + mul_10 = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_layer_norm_parameters_bias_ = (None) + layer_input_6 = torch._C._nn.linear( + layer_outputs_10, + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_ = (None) + mul_11 = ( + layer_input_6 + * l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ + ) + layer_input_6 = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_weight_ = (None) + layer_input_7 = ( + mul_11 + + l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ + ) + mul_11 = l_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_layer_norm_parameters_bias_ = (None) + linear_18 = torch._C._nn.linear( + layer_input_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_4 = linear_18.view(1, -1, 4, 32) + linear_18 = None + query_layer_1 = view_4.transpose(1, 2) + view_4 = None + linear_19 = torch._C._nn.linear( + layer_input_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + layer_input_7 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_5 = linear_19.view(1, -1, 4, 32) + linear_19 = None + key_layer_1 = view_5.transpose(1, 2) + view_5 = None + linear_20 = torch._C._nn.linear( + layer_outputs_10, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_6 = linear_20.view(1, -1, 4, 32) + linear_20 = None + value_layer_1 = view_6.transpose(1, 2) + view_6 = None + transpose_7 = key_layer_1.transpose(-1, -2) + key_layer_1 = None + attention_scores_3 = torch.matmul(query_layer_1, transpose_7) + query_layer_1 = transpose_7 = None + attention_scores_4 = attention_scores_3 / 5.656854249492381 + attention_scores_3 = None + attention_scores_5 = attention_scores_4 + extended_attention_mask_2 + attention_scores_4 = extended_attention_mask_2 = None + attention_probs_2 = torch.nn.functional.softmax(attention_scores_5, dim=-1) + attention_scores_5 = None + attention_probs_3 = torch.nn.functional.dropout( + attention_probs_2, 0.1, False, False + ) + attention_probs_2 = None + context_layer_3 = torch.matmul(attention_probs_3, value_layer_1) + attention_probs_3 = value_layer_1 = None + permute_1 = context_layer_3.permute(0, 2, 1, 3) + context_layer_3 = None + context_layer_4 = permute_1.contiguous() + permute_1 = None + context_layer_5 = context_layer_4.view((1, 20, 128)) + context_layer_4 = None + layer_outputs_11 = torch._C._nn.linear( + context_layer_5, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + add_21 = layer_outputs_11 + layer_input_5 + layer_outputs_11 = layer_input_5 = None + mul_12 = ( + add_21 + * l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ + ) + add_21 = 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_12 = ( + mul_12 + + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_12 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_8 = torch._C._nn.linear( + layer_outputs_12, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.relu(hidden_states_8, inplace=False) + hidden_states_8 = None + layer_outputs_13 = torch._C._nn.linear( + hidden_states_9, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_9 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + add_23 = layer_outputs_13 + layer_outputs_12 + layer_outputs_13 = layer_outputs_12 = None + mul_13 = ( + add_23 + * l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ + ) + add_23 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_14 = ( + mul_13 + + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_13 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_10 = torch._C._nn.linear( + layer_outputs_14, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_11 = torch.nn.functional.relu(hidden_states_10, inplace=False) + hidden_states_10 = None + layer_outputs_15 = torch._C._nn.linear( + hidden_states_11, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_11 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + add_25 = layer_outputs_15 + layer_outputs_14 + layer_outputs_15 = layer_outputs_14 = None + mul_14 = ( + add_25 + * l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ + ) + add_25 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_16 = ( + mul_14 + + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_14 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.linear( + layer_outputs_16, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_13 = torch.nn.functional.relu(hidden_states_12, inplace=False) + hidden_states_12 = None + layer_outputs_17 = torch._C._nn.linear( + hidden_states_13, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_13 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + add_27 = layer_outputs_17 + layer_outputs_16 + layer_outputs_17 = layer_outputs_16 = None + mul_15 = ( + add_27 + * l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ + ) + add_27 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_18 = ( + mul_15 + + l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_15 = l_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_14 = torch._C._nn.linear( + layer_outputs_18, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_15 = torch.nn.functional.relu(hidden_states_14, inplace=False) + hidden_states_14 = None + layer_output_2 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_15 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + add_29 = layer_output_2 + layer_outputs_18 + layer_output_2 = layer_outputs_18 = None + mul_16 = ( + add_29 + * l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ + ) + add_29 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = (None) + layer_output_3 = ( + mul_16 + + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ + ) + mul_16 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + layer_outputs_19 = torch._C._nn.linear( + layer_output_3, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_, + ) + layer_output_3 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_ = (None) + layer_outputs_20 = torch.nn.functional.dropout( + layer_outputs_19, 0.0, False, False + ) + layer_outputs_19 = None + add_31 = layer_outputs_20 + layer_outputs_10 + layer_outputs_20 = layer_outputs_10 = None + mul_17 = ( + add_31 + * l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ + ) + add_31 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_weight_ = (None) + layer_outputs_21 = ( + mul_17 + + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ + ) + mul_17 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_layer_norm_parameters_bias_ = (None) + tensor_1 = torch.tensor(1000) + tensor_1 = None + first_token_tensor = layer_outputs_21[(slice(None, None, None), 0)] + pooled_output = torch._C._nn.linear( + first_token_tensor, + l_self_modules_pooler_modules_dense_parameters_weight_, + l_self_modules_pooler_modules_dense_parameters_bias_, + ) + first_token_tensor = ( + l_self_modules_pooler_modules_dense_parameters_weight_ + ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None + pooled_output_1 = torch.tanh(pooled_output) + pooled_output = None + return (layer_outputs_21, pooled_output_1) diff --git a/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/weight_meta.py b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/weight_meta.py new file mode 100644 index 000000000..91226521a --- /dev/null +++ b/samples/transformers-auto-model/hossamamer12_BUS15100_MB2_20epoch_notweettokenizer_fp16/weight_meta.py @@ -0,0 +1,1072 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 20] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [ + 2, + 966, + 1095, + 45, + 58, + 150, + 118, + 1460, + 94, + 2005, + 1673, + 1376, + 399, + 141, + 966, + 1095, + 45, + 51, + 1, + 3, + ] + + +class 
Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 20] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_token_type_ids_: + name = "L_token_type_ids_" + shape = [1, 20] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [2016, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_: + name = "L_self_modules_embeddings_buffers_position_ids_" + shape = [1, 512] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 511 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_embedding_transformation_parameters_weight_: + name = ( + "L_self_modules_embeddings_modules_embedding_transformation_parameters_weight_" + ) + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_embedding_transformation_parameters_bias_: + name = "L_self_modules_embeddings_modules_embedding_transformation_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [2, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + 
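+    # Editorial note: each Program_weight_tensor_meta_* class records the
+    # shape/dtype/device and summary statistics of one captured tensor;
+    # explicit `data` values appear only for the small integer input tensors
+    # (ids and masks), while learned weights keep data = None and are
+    # summarized by mean/std.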
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_input_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_bottleneck_modules_attention_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [128, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_ffn_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [128, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_weight_" + shape = [512, 128] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_bottleneck_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = 
"L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_hash.txt b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_hash.txt new file mode 100644 index 000000000..10a8fd148 --- /dev/null +++ b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_hash.txt @@ -0,0 +1 @@ +9ccd4f0e239878f550df49d0608cf7edbf9811412e7c91ef39084873a72496df \ No newline at end of file diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_net.json b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_meta.py b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_tensor_constraints.py b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/model.py b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/model.py new file mode 100644 index 000000000..ebe4c0454 --- /dev/null +++ b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/model.py @@ -0,0 +1,2301 @@ +import torch + +from torch import device + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_inputs_embeds_: torch.Tensor, + L_encoder_hidden_states_: torch.Tensor, + L_encoder_attention_mask_: torch.Tensor, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_relative_attention_bias_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_final_layer_norm_parameters_weight_: torch.nn.parameter.Parameter, + ):
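The signature above is characteristic of a TorchDynamo capture: every parameter of the traced plT5 decoder is lifted into a flat forward argument with a mangled L_self_modules_... name, then re-bound to a lower-case local before use (the aliasing block that follows). The argument list also documents the architecture: eight decoder blocks, each holding self-attention (layer 0, with relative_attention_bias only in block 0, since T5-style models share the bias table across layers), cross-attention (layer 1, EncDecAttention), and a feed-forward whose wi_0/wi_1/wo weights indicate the gated-activation variant, wo(act(wi_0(x)) * wi_1(x)). A minimal sketch of producing such a GraphModule dump with a custom torch.compile backend follows; Tiny is a hypothetical stand-in module, not the model in this patch:

import torch
from torch import nn

class Tiny(nn.Module):
    # Hypothetical stand-in for the real decoder, just to show the capture.
    def __init__(self):
        super().__init__()
        self.block = nn.ModuleList([nn.Linear(8, 8, bias=False) for _ in range(2)])

    def forward(self, x):
        for layer in self.block:
            x = torch.relu(layer(x))
        return x

def dump_graph(gm: torch.fx.GraphModule, example_inputs):
    # Print the captured module, analogous to the GraphModule in model.py.
    gm.print_readable()
    return gm.forward

compiled = torch.compile(Tiny(), backend=dump_graph)
compiled(torch.randn(1, 8))

On a recent PyTorch build this prints the captured forward; whether parameters show up as flat placeholder arguments (as in this sample) or as module attributes depends on the PyTorch version and Dynamo configuration.

+ l_inputs_embeds_ = L_inputs_embeds_ + l_encoder_hidden_states_ = L_encoder_hidden_states_ + l_encoder_attention_mask_ = L_encoder_attention_mask_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_relative_attention_bias_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_relative_attention_bias_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_ + l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ =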
+        L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
+        l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_
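The long run of `l_* = L_*` assignments ending here is Dynamo's argument unpacking: every parameter of the traced T5 decoder (layer norms, Q/K/V/O projections, and gated feed-forward weights for blocks 0-7) enters the captured graph as an explicit input rather than as module state. A readable dump in this style can be produced with a custom torch.compile backend; the sketch below is illustrative only (the `Tiny` module is a hypothetical stand-in, not part of this sample):

    import torch

    def dump_backend(gm: torch.fx.GraphModule, example_inputs):
        # Print the captured FX graph in readable Python form, then run it
        # unmodified (no actual compilation happens in this backend).
        gm.print_readable()
        return gm.forward

    class Tiny(torch.nn.Module):  # hypothetical stand-in for the traced model
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(4, 4, bias=False)

        def forward(self, x):
            return torch.relu(self.proj(x))

    compiled = torch.compile(Tiny(), backend=dump_backend)
    compiled(torch.randn(2, 4))  # first call triggers tracing and prints the graph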
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_
+        l_self_modules_final_layer_norm_parameters_weight_ = (
+            L_self_modules_final_layer_norm_parameters_weight_
+        )
+        cache_position = torch.arange(0, 22, device=device(type="cuda", index=0))
+        causal_mask = torch.full(
+            (22, 23),
+            fill_value=-3.4028234663852886e38,
+            dtype=torch.float32,
+            device=device(type="cuda", index=0),
+        )
+        causal_mask_1 = torch.triu(causal_mask, diagonal=1)
+        causal_mask = None
+        arange_1 = torch.arange(23, device=device(type="cuda", index=0))
+        reshape = cache_position.reshape(-1, 1)
+        gt = arange_1 > reshape
+        arange_1 = reshape = None
+        causal_mask_1 *= gt
+        causal_mask_2 = causal_mask_1
+        causal_mask_1 = gt = None
+        getitem = causal_mask_2[
+            (None, None, slice(None, None, None), slice(None, None, None))
+        ]
+        causal_mask_2 = None
+        causal_mask_3 = getitem.expand(1, 1, -1, -1)
+        getitem = None
+        encoder_extended_attention_mask = l_encoder_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_encoder_attention_mask_ = None
+        encoder_extended_attention_mask_1 = encoder_extended_attention_mask.to(
+            dtype=torch.float32
+        )
+        encoder_extended_attention_mask = None
+        sub = 1.0 - encoder_extended_attention_mask_1
+        encoder_extended_attention_mask_1 = None
+        encoder_extended_attention_mask_2 = sub * -3.4028234663852886e38
+        sub = None
+        hidden_states = torch.nn.functional.dropout(l_inputs_embeds_, 0.1, False, False)
+        l_inputs_embeds_ = None
+        to_1 = hidden_states.to(torch.float32)
+        pow_1 = to_1.pow(2)
+        to_1 = None
+        variance = pow_1.mean(-1, keepdim=True)
+        pow_1 = None
+        add = variance + 1e-06
+        variance = None
+        rsqrt = torch.rsqrt(add)
+        add = None
+        hidden_states_1 = hidden_states * rsqrt
+        rsqrt = None
+        normed_hidden_states = (
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+            * hidden_states_1
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = (
+            hidden_states_1
+        ) = None
+        query_states = torch._C._nn.linear(
+            normed_hidden_states,
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = (
+            None
+        )
+        view = query_states.view(1, -1, 6, 64)
+        query_states = None
+        query_states_1 = view.transpose(1, 2)
+        view = None
+        key_states = torch._C._nn.linear(
+            normed_hidden_states,
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = (
+            None
+        )
+        value_states = torch._C._nn.linear(
+            normed_hidden_states,
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_,
+            None,
+        )
+        normed_hidden_states = l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None)
+        view_1 = key_states.view(1, -1, 6, 64)
+        key_states = None
+        key_states_1 = view_1.transpose(1, 2)
+        view_1 = None
+        view_2 = value_states.view(1, -1, 6, 64)
+        value_states = None
+        value_states_1 = view_2.transpose(1, 2)
+        view_2 = None
+        transpose_3 = key_states_1.transpose(3, 2)
+        scores = torch.matmul(query_states_1, transpose_3)
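The mask setup above is worth unpacking: the decoder's causal mask starts as a (22, 23) grid filled with float32's minimum (-3.4028234663852886e+38), is made strictly upper-triangular, and is then kept only where the key index lies beyond the query's cache position; the encoder padding mask is separately converted from 1/0 form into the same additive form via (1.0 - mask) * min. A minimal re-derivation of the causal part (22 and 23 are the concrete lengths baked into this trace):

    import torch

    seq_len, tgt_len = 22, 23  # values hard-coded in the captured graph
    min_val = torch.finfo(torch.float32).min  # -3.4028234663852886e+38

    cache_position = torch.arange(seq_len)
    mask = torch.full((seq_len, tgt_len), min_val)
    mask = torch.triu(mask, diagonal=1)
    # Zero the mask wherever the key position is not strictly ahead of the
    # query's cache position, leaving min_val only at future positions.
    mask = mask * (torch.arange(tgt_len) > cache_position.reshape(-1, 1))
    mask = mask[None, None, :, :]  # (1, 1, 22, 23); broadcasts over batch and heads
    print(mask[0, 0, :3, :4])      # top-left corner: 0 on/below the diagonal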
+        query_states_1 = transpose_3 = None
+        getitem_2 = cache_position[-1]
+        real_seq_length = getitem_2 + 1
+        getitem_2 = real_seq_length = None
+        getitem_3 = cache_position[(slice(None, None, None), None)]
+        context_position = getitem_3.to(device(type="cuda", index=0))
+        getitem_3 = None
+        arange_2 = torch.arange(
+            22, dtype=torch.int64, device=device(type="cuda", index=0)
+        )
+        memory_position = arange_2[(None, slice(None, None, None))]
+        arange_2 = None
+        relative_position = memory_position - context_position
+        memory_position = context_position = None
+        zeros_like = torch.zeros_like(relative_position)
+        min_1 = torch.min(relative_position, zeros_like)
+        relative_position = zeros_like = None
+        relative_position_1 = -min_1
+        min_1 = None
+        is_small = relative_position_1 < 16
+        float_1 = relative_position_1.float()
+        truediv = float_1 / 16
+        float_1 = None
+        log = torch.log(truediv)
+        truediv = None
+        truediv_1 = log / 2.0794415416798357
+        log = None
+        mul_3 = truediv_1 * 16
+        truediv_1 = None
+        to_3 = mul_3.to(torch.int64)
+        mul_3 = None
+        relative_position_if_large = 16 + to_3
+        to_3 = None
+        full_like = torch.full_like(relative_position_if_large, 31)
+        relative_position_if_large_1 = torch.min(relative_position_if_large, full_like)
+        relative_position_if_large = full_like = None
+        where = torch.where(is_small, relative_position_1, relative_position_if_large_1)
+        is_small = relative_position_1 = relative_position_if_large_1 = None
+        relative_buckets = 0 + where
+        where = None
+        values = torch.nn.functional.embedding(
+            relative_buckets,
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_relative_attention_bias_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        relative_buckets = l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_relative_attention_bias_parameters_weight_ = (None)
+        permute = values.permute([2, 0, 1])
+        values = None
+        values_1 = permute.unsqueeze(0)
+        permute = None
+        position_bias = values_1[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(-22, None, None),
+                slice(None, None, None),
+            )
+        ]
+        values_1 = None
+        causal_mask_4 = causal_mask_3[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 22, None),
+            )
+        ]
+        causal_mask_3 = None
+        position_bias_1 = position_bias + causal_mask_4
+        position_bias = causal_mask_4 = None
+        scores += position_bias_1
+        scores_1 = scores
+        scores = None
+        float_2 = scores_1.float()
+        softmax = torch.nn.functional.softmax(float_2, dim=-1)
+        float_2 = None
+        attn_weights = softmax.type_as(scores_1)
+        softmax = scores_1 = None
+        attn_weights_1 = torch.nn.functional.dropout(
+            attn_weights, p=0.1, training=False
+        )
+        attn_weights = None
+        attn_output = torch.matmul(attn_weights_1, value_states_1)
+        attn_weights_1 = None
+        transpose_4 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_1 = transpose_4.contiguous()
+        transpose_4 = None
+        attn_output_2 = attn_output_1.view(1, -1, 384)
+        attn_output_1 = None
+        attn_output_3 = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_,
+            None,
+        )
+        attn_output_2 = l_self_modules_block_modules_0_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None)
+        dropout_2 = torch.nn.functional.dropout(attn_output_3, 0.1, False, False)
+        attn_output_3 = None
+        hidden_states_2 = hidden_states + dropout_2
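The arithmetic run above is T5's relative-position bucketing, inlined by the tracer: distances to past keys get 16 exact buckets plus 16 log-spaced ones capped at bucket 31 (num_buckets=32, max_distance=128; the constant 2.0794415416798357 is log(128/16)), and the bucket ids index relative_attention_bias to form position_bias. A standalone sketch of the same computation:

    import torch

    def relative_position_bucket(relative_position, num_buckets=32, max_distance=128):
        # Unidirectional variant matching the decoder trace above: only past
        # (negative) distances count; half the buckets are exact, half log-spaced.
        rp = -torch.min(relative_position, torch.zeros_like(relative_position))
        max_exact = num_buckets // 2  # 16 exact buckets
        is_small = rp < max_exact
        rp_large = max_exact + (
            torch.log(rp.float() / max_exact)
            / torch.log(torch.tensor(max_distance / max_exact))
            * (num_buckets - max_exact)
        ).to(torch.int64)
        # Values computed for rp < max_exact are garbage (log of <1 or 0) but
        # are discarded by torch.where, exactly as in the traced code.
        rp_large = torch.min(rp_large, torch.full_like(rp_large, num_buckets - 1))
        return torch.where(is_small, rp, rp_large)

    q_pos = torch.arange(22)[:, None]
    k_pos = torch.arange(22)[None, :]
    buckets = relative_position_bucket(k_pos - q_pos)  # (22, 22) bucket ids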
+        hidden_states = dropout_2 = None
+        getitem_7 = cache_position[-1]
+        real_seq_length_1 = getitem_7 + 1
+        getitem_7 = real_seq_length_1 = None
+        to_4 = hidden_states_2.to(torch.float32)
+        pow_2 = to_4.pow(2)
+        to_4 = None
+        variance_1 = pow_2.mean(-1, keepdim=True)
+        pow_2 = None
+        add_7 = variance_1 + 1e-06
+        variance_1 = None
+        rsqrt_1 = torch.rsqrt(add_7)
+        add_7 = None
+        hidden_states_3 = hidden_states_2 * rsqrt_1
+        rsqrt_1 = None
+        normed_hidden_states_1 = (
+            l_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_
+            * hidden_states_3
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = (
+            hidden_states_3
+        ) = None
+        query_states_2 = torch._C._nn.linear(
+            normed_hidden_states_1,
+            l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_,
+            None,
+        )
+        normed_hidden_states_1 = l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None)
+        view_4 = query_states_2.view(1, -1, 6, 64)
+        query_states_2 = None
+        query_states_3 = view_4.transpose(1, 2)
+        view_4 = None
+        key_states_2 = torch._C._nn.linear(
+            l_encoder_hidden_states_,
+            l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = (
+            None
+        )
+        value_states_2 = torch._C._nn.linear(
+            l_encoder_hidden_states_,
+            l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = (
+            None
+        )
+        view_5 = key_states_2.view(1, -1, 6, 64)
+        key_states_2 = None
+        key_states_3 = view_5.transpose(1, 2)
+        view_5 = None
+        view_6 = value_states_2.view(1, -1, 6, 64)
+        value_states_2 = None
+        value_states_3 = view_6.transpose(1, 2)
+        view_6 = None
+        transpose_8 = key_states_3.transpose(3, 2)
+        scores_2 = torch.matmul(query_states_3, transpose_8)
+        query_states_3 = transpose_8 = None
+        position_bias_2 = torch.zeros(
+            (1, 6, 22, 22), device=device(type="cuda", index=0), dtype=torch.float32
+        )
+        causal_mask_5 = encoder_extended_attention_mask_2[
+            (
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, None, None),
+                slice(None, 22, None),
+            )
+        ]
+        encoder_extended_attention_mask_2 = None
+        position_bias_3 = position_bias_2 + causal_mask_5
+        position_bias_2 = causal_mask_5 = None
+        scores_2 += position_bias_3
+        scores_3 = scores_2
+        scores_2 = None
+        float_3 = scores_3.float()
+        softmax_1 = torch.nn.functional.softmax(float_3, dim=-1)
+        float_3 = None
+        attn_weights_2 = softmax_1.type_as(scores_3)
+        softmax_1 = scores_3 = None
+        attn_weights_3 = torch.nn.functional.dropout(
+            attn_weights_2, p=0.1, training=False
+        )
+        attn_weights_2 = None
+        attn_output_4 = torch.matmul(attn_weights_3, value_states_3)
+        attn_weights_3 = None
+        transpose_9 = attn_output_4.transpose(1, 2)
+        attn_output_4 = None
+        attn_output_5 = transpose_9.contiguous()
+        transpose_9 = None
+        attn_output_6 = attn_output_5.view(1, -1, 384)
+        attn_output_5 = None
+        attn_output_7 = torch._C._nn.linear(
+            attn_output_6,
+            l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_,
+            None,
+        )
+        attn_output_6 = l_self_modules_block_modules_0_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None)
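This second sub-layer is the cross-attention: queries come from the decoder stream while keys and values are projected from l_encoder_hidden_states_, and the additive encoder padding mask stands in for a position bias (T5 applies no 1/sqrt(d_head) scaling; it is folded into the weight initialization). Stripped of the tracer's bookkeeping, the pattern is as below (the weights are random placeholders):

    import torch

    def cross_attention(dec, enc, wq, wk, wv, wo, n_heads=6, d_head=64):
        # Minimal restatement of the enc-dec attention above; no 1/sqrt(d_head)
        # scaling, matching T5's convention.
        b = dec.shape[0]
        q = (dec @ wq.T).view(b, -1, n_heads, d_head).transpose(1, 2)
        k = (enc @ wk.T).view(b, -1, n_heads, d_head).transpose(1, 2)
        v = (enc @ wv.T).view(b, -1, n_heads, d_head).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(3, 2))  # (b, heads, dec_len, enc_len)
        attn = torch.softmax(scores.float(), dim=-1).type_as(scores)
        out = torch.matmul(attn, v).transpose(1, 2).contiguous()
        return out.view(b, -1, n_heads * d_head) @ wo.T

    d_model = 384
    dec = torch.randn(1, 22, d_model)
    enc = torch.randn(1, 22, d_model)
    ws = [torch.randn(d_model, d_model) * 0.02 for _ in range(4)]  # placeholders
    out = cross_attention(dec, enc, *ws)  # (1, 22, 384)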
+        dropout_4 = torch.nn.functional.dropout(attn_output_7, 0.1, False, False)
+        attn_output_7 = None
+        layer_output = hidden_states_2 + dropout_4
+        hidden_states_2 = dropout_4 = None
+        to_5 = layer_output.to(torch.float32)
+        pow_3 = to_5.pow(2)
+        to_5 = None
+        variance_2 = pow_3.mean(-1, keepdim=True)
+        pow_3 = None
+        add_10 = variance_2 + 1e-06
+        variance_2 = None
+        rsqrt_2 = torch.rsqrt(add_10)
+        add_10 = None
+        hidden_states_4 = layer_output * rsqrt_2
+        rsqrt_2 = None
+        forwarded_states = (
+            l_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+            * hidden_states_4
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = (
+            hidden_states_4
+        ) = None
+        linear_8 = torch._C._nn.linear(
+            forwarded_states,
+            l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = (
+            None
+        )
+        mul_8 = 0.5 * linear_8
+        pow_4 = torch.pow(linear_8, 3.0)
+        mul_9 = 0.044715 * pow_4
+        pow_4 = None
+        add_11 = linear_8 + mul_9
+        linear_8 = mul_9 = None
+        mul_10 = 0.7978845608028654 * add_11
+        add_11 = None
+        tanh = torch.tanh(mul_10)
+        mul_10 = None
+        add_12 = 1.0 + tanh
+        tanh = None
+        hidden_gelu = mul_8 * add_12
+        mul_8 = add_12 = None
+        hidden_linear = torch._C._nn.linear(
+            forwarded_states,
+            l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_,
+            None,
+        )
+        forwarded_states = l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None)
+        hidden_states_5 = hidden_gelu * hidden_linear
+        hidden_gelu = hidden_linear = None
+        hidden_states_6 = torch.nn.functional.dropout(
+            hidden_states_5, 0.1, False, False
+        )
+        hidden_states_5 = None
+        hidden_states_7 = torch._C._nn.linear(
+            hidden_states_6,
+            l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_,
+            None,
+        )
+        hidden_states_6 = l_self_modules_block_modules_0_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None)
+        dropout_6 = torch.nn.functional.dropout(hidden_states_7, 0.1, False, False)
+        hidden_states_7 = None
+        hidden_states_8 = layer_output + dropout_6
+        layer_output = dropout_6 = None
+        to_6 = hidden_states_8.to(torch.float32)
+        pow_5 = to_6.pow(2)
+        to_6 = None
+        variance_3 = pow_5.mean(-1, keepdim=True)
+        pow_5 = None
+        add_14 = variance_3 + 1e-06
+        variance_3 = None
+        rsqrt_3 = torch.rsqrt(add_14)
+        add_14 = None
+        hidden_states_9 = hidden_states_8 * rsqrt_3
+        rsqrt_3 = None
+        normed_hidden_states_2 = (
+            l_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_
+            * hidden_states_9
+        )
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = (
+            hidden_states_9
+        ) = None
+        query_states_4 = torch._C._nn.linear(
+            normed_hidden_states_2,
+            l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_,
+            None,
+        )
+        l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = (
+            None
+        )
+        view_8 = query_states_4.view(1, -1, 6, 64)
+        query_states_4 = None
+        query_states_5 = view_8.transpose(1, 2)
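The feed-forward expansion above is the gated-GELU variant used in T5 v1.1-style checkpoints: wi_0's output passes through the tanh-approximated GELU (0.7978845608028654 is sqrt(2/pi), with the 0.044715 cubic correction), is gated elementwise by wi_1's output, and is projected back by wo. Compactly (d_ff below is an assumed width; only d_model=384 is visible in this trace):

    import torch

    def gated_gelu_ffn(x, wi_0, wi_1, wo):
        # T5 v1.1-style gated feed-forward matching the inlined expansion above:
        # tanh-approximated GELU of wi_0(x), gated elementwise by wi_1(x).
        h = x @ wi_0.T
        gelu = 0.5 * h * (
            1.0 + torch.tanh(0.7978845608028654 * (h + 0.044715 * torch.pow(h, 3.0)))
        )
        return (gelu * (x @ wi_1.T)) @ wo.T

    d_model, d_ff = 384, 1536  # d_ff is an assumption; the trace only shows d_model
    x = torch.randn(1, 22, d_model)
    wi_0 = torch.randn(d_ff, d_model) * 0.02
    wi_1 = torch.randn(d_ff, d_model) * 0.02
    wo = torch.randn(d_model, d_ff) * 0.02
    y = gated_gelu_ffn(x, wi_0, wi_1, wo)  # (1, 22, 384)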
view_8 = None + key_states_4 = torch._C._nn.linear( + normed_hidden_states_2, + l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_4 = torch._C._nn.linear( + normed_hidden_states_2, + l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_2 = l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_9 = key_states_4.view(1, -1, 6, 64) + key_states_4 = None + key_states_5 = view_9.transpose(1, 2) + view_9 = None + view_10 = value_states_4.view(1, -1, 6, 64) + value_states_4 = None + value_states_5 = view_10.transpose(1, 2) + view_10 = None + transpose_13 = key_states_5.transpose(3, 2) + scores_4 = torch.matmul(query_states_5, transpose_13) + query_states_5 = transpose_13 = None + scores_4 += position_bias_1 + scores_5 = scores_4 + scores_4 = None + float_4 = scores_5.float() + softmax_2 = torch.nn.functional.softmax(float_4, dim=-1) + float_4 = None + attn_weights_4 = softmax_2.type_as(scores_5) + softmax_2 = scores_5 = None + attn_weights_5 = torch.nn.functional.dropout( + attn_weights_4, p=0.1, training=False + ) + attn_weights_4 = None + attn_output_8 = torch.matmul(attn_weights_5, value_states_5) + attn_weights_5 = None + transpose_14 = attn_output_8.transpose(1, 2) + attn_output_8 = None + attn_output_9 = transpose_14.contiguous() + transpose_14 = None + attn_output_10 = attn_output_9.view(1, -1, 384) + attn_output_9 = None + attn_output_11 = torch._C._nn.linear( + attn_output_10, + l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_10 = l_self_modules_block_modules_1_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_8 = torch.nn.functional.dropout(attn_output_11, 0.1, False, False) + attn_output_11 = None + hidden_states_10 = hidden_states_8 + dropout_8 + hidden_states_8 = dropout_8 = None + getitem_9 = cache_position[-1] + add_16 = getitem_9 + 1 + getitem_9 = add_16 = None + to_7 = hidden_states_10.to(torch.float32) + pow_6 = to_7.pow(2) + to_7 = None + variance_4 = pow_6.mean(-1, keepdim=True) + pow_6 = None + add_17 = variance_4 + 1e-06 + variance_4 = None + rsqrt_4 = torch.rsqrt(add_17) + add_17 = None + hidden_states_11 = hidden_states_10 * rsqrt_4 + rsqrt_4 = None + normed_hidden_states_3 = ( + l_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_11 + ) + l_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_11 + ) = None + query_states_6 = torch._C._nn.linear( + normed_hidden_states_3, + l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_3 = l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_12 = query_states_6.view(1, -1, 6, 64) + query_states_6 = None + query_states_7 = view_12.transpose(1, 2) + view_12 = None + key_states_6 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + 
l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_6 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_13 = key_states_6.view(1, -1, 6, 64) + key_states_6 = None + key_states_7 = view_13.transpose(1, 2) + view_13 = None + view_14 = value_states_6.view(1, -1, 6, 64) + value_states_6 = None + value_states_7 = view_14.transpose(1, 2) + view_14 = None + transpose_18 = key_states_7.transpose(3, 2) + scores_6 = torch.matmul(query_states_7, transpose_18) + query_states_7 = transpose_18 = None + scores_6 += position_bias_3 + scores_7 = scores_6 + scores_6 = None + float_5 = scores_7.float() + softmax_3 = torch.nn.functional.softmax(float_5, dim=-1) + float_5 = None + attn_weights_6 = softmax_3.type_as(scores_7) + softmax_3 = scores_7 = None + attn_weights_7 = torch.nn.functional.dropout( + attn_weights_6, p=0.1, training=False + ) + attn_weights_6 = None + attn_output_12 = torch.matmul(attn_weights_7, value_states_7) + attn_weights_7 = None + transpose_19 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_13 = transpose_19.contiguous() + transpose_19 = None + attn_output_14 = attn_output_13.view(1, -1, 384) + attn_output_13 = None + attn_output_15 = torch._C._nn.linear( + attn_output_14, + l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_14 = l_self_modules_block_modules_1_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_10 = torch.nn.functional.dropout(attn_output_15, 0.1, False, False) + attn_output_15 = None + layer_output_1 = hidden_states_10 + dropout_10 + hidden_states_10 = dropout_10 = None + to_8 = layer_output_1.to(torch.float32) + pow_7 = to_8.pow(2) + to_8 = None + variance_5 = pow_7.mean(-1, keepdim=True) + pow_7 = None + add_19 = variance_5 + 1e-06 + variance_5 = None + rsqrt_5 = torch.rsqrt(add_19) + add_19 = None + hidden_states_12 = layer_output_1 * rsqrt_5 + rsqrt_5 = None + forwarded_states_1 = ( + l_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_12 + ) + l_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_12 + ) = None + linear_19 = torch._C._nn.linear( + forwarded_states_1, + l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_19 = 0.5 * linear_19 + pow_8 = torch.pow(linear_19, 3.0) + mul_20 = 0.044715 * pow_8 + pow_8 = None + add_20 = linear_19 + mul_20 + linear_19 = mul_20 = None + mul_21 = 0.7978845608028654 * add_20 + add_20 = None + tanh_1 = torch.tanh(mul_21) + mul_21 = None + add_21 = 1.0 + tanh_1 + tanh_1 = None + hidden_gelu_1 = mul_19 * add_21 + mul_19 = add_21 = None + hidden_linear_1 = torch._C._nn.linear( + forwarded_states_1, + l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_1 = 
l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_13 = hidden_gelu_1 * hidden_linear_1 + hidden_gelu_1 = hidden_linear_1 = None + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + hidden_states_15 = torch._C._nn.linear( + hidden_states_14, + l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_14 = l_self_modules_block_modules_1_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_12 = torch.nn.functional.dropout(hidden_states_15, 0.1, False, False) + hidden_states_15 = None + hidden_states_16 = layer_output_1 + dropout_12 + layer_output_1 = dropout_12 = None + to_9 = hidden_states_16.to(torch.float32) + pow_9 = to_9.pow(2) + to_9 = None + variance_6 = pow_9.mean(-1, keepdim=True) + pow_9 = None + add_23 = variance_6 + 1e-06 + variance_6 = None + rsqrt_6 = torch.rsqrt(add_23) + add_23 = None + hidden_states_17 = hidden_states_16 * rsqrt_6 + rsqrt_6 = None + normed_hidden_states_4 = ( + l_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_17 + ) + l_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_17 + ) = None + query_states_8 = torch._C._nn.linear( + normed_hidden_states_4, + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_16 = query_states_8.view(1, -1, 6, 64) + query_states_8 = None + query_states_9 = view_16.transpose(1, 2) + view_16 = None + key_states_8 = torch._C._nn.linear( + normed_hidden_states_4, + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_8 = torch._C._nn.linear( + normed_hidden_states_4, + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_4 = l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_17 = key_states_8.view(1, -1, 6, 64) + key_states_8 = None + key_states_9 = view_17.transpose(1, 2) + view_17 = None + view_18 = value_states_8.view(1, -1, 6, 64) + value_states_8 = None + value_states_9 = view_18.transpose(1, 2) + view_18 = None + transpose_23 = key_states_9.transpose(3, 2) + scores_8 = torch.matmul(query_states_9, transpose_23) + query_states_9 = transpose_23 = None + scores_8 += position_bias_1 + scores_9 = scores_8 + scores_8 = None + float_6 = scores_9.float() + softmax_4 = torch.nn.functional.softmax(float_6, dim=-1) + float_6 = None + attn_weights_8 = softmax_4.type_as(scores_9) + softmax_4 = scores_9 = None + attn_weights_9 = torch.nn.functional.dropout( + attn_weights_8, p=0.1, training=False + ) + attn_weights_8 = None + attn_output_16 = torch.matmul(attn_weights_9, value_states_9) + attn_weights_9 = None + transpose_24 = attn_output_16.transpose(1, 2) + attn_output_16 = None + attn_output_17 = transpose_24.contiguous() + transpose_24 = None + attn_output_18 = attn_output_17.view(1, -1, 
384) + attn_output_17 = None + attn_output_19 = torch._C._nn.linear( + attn_output_18, + l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_18 = l_self_modules_block_modules_2_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_14 = torch.nn.functional.dropout(attn_output_19, 0.1, False, False) + attn_output_19 = None + hidden_states_18 = hidden_states_16 + dropout_14 + hidden_states_16 = dropout_14 = None + getitem_10 = cache_position[-1] + add_25 = getitem_10 + 1 + getitem_10 = add_25 = None + to_10 = hidden_states_18.to(torch.float32) + pow_10 = to_10.pow(2) + to_10 = None + variance_7 = pow_10.mean(-1, keepdim=True) + pow_10 = None + add_26 = variance_7 + 1e-06 + variance_7 = None + rsqrt_7 = torch.rsqrt(add_26) + add_26 = None + hidden_states_19 = hidden_states_18 * rsqrt_7 + rsqrt_7 = None + normed_hidden_states_5 = ( + l_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_19 + ) + l_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_19 + ) = None + query_states_10 = torch._C._nn.linear( + normed_hidden_states_5, + l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_5 = l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_20 = query_states_10.view(1, -1, 6, 64) + query_states_10 = None + query_states_11 = view_20.transpose(1, 2) + view_20 = None + key_states_10 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_10 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_21 = key_states_10.view(1, -1, 6, 64) + key_states_10 = None + key_states_11 = view_21.transpose(1, 2) + view_21 = None + view_22 = value_states_10.view(1, -1, 6, 64) + value_states_10 = None + value_states_11 = view_22.transpose(1, 2) + view_22 = None + transpose_28 = key_states_11.transpose(3, 2) + scores_10 = torch.matmul(query_states_11, transpose_28) + query_states_11 = transpose_28 = None + scores_10 += position_bias_3 + scores_11 = scores_10 + scores_10 = None + float_7 = scores_11.float() + softmax_5 = torch.nn.functional.softmax(float_7, dim=-1) + float_7 = None + attn_weights_10 = softmax_5.type_as(scores_11) + softmax_5 = scores_11 = None + attn_weights_11 = torch.nn.functional.dropout( + attn_weights_10, p=0.1, training=False + ) + attn_weights_10 = None + attn_output_20 = torch.matmul(attn_weights_11, value_states_11) + attn_weights_11 = None + transpose_29 = attn_output_20.transpose(1, 2) + attn_output_20 = None + attn_output_21 = transpose_29.contiguous() + transpose_29 = None + attn_output_22 = attn_output_21.view(1, -1, 384) + attn_output_21 = None + attn_output_23 = torch._C._nn.linear( + attn_output_22, + 
l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_22 = l_self_modules_block_modules_2_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_16 = torch.nn.functional.dropout(attn_output_23, 0.1, False, False) + attn_output_23 = None + layer_output_2 = hidden_states_18 + dropout_16 + hidden_states_18 = dropout_16 = None + to_11 = layer_output_2.to(torch.float32) + pow_11 = to_11.pow(2) + to_11 = None + variance_8 = pow_11.mean(-1, keepdim=True) + pow_11 = None + add_28 = variance_8 + 1e-06 + variance_8 = None + rsqrt_8 = torch.rsqrt(add_28) + add_28 = None + hidden_states_20 = layer_output_2 * rsqrt_8 + rsqrt_8 = None + forwarded_states_2 = ( + l_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_20 + ) + l_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_20 + ) = None + linear_30 = torch._C._nn.linear( + forwarded_states_2, + l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_30 = 0.5 * linear_30 + pow_12 = torch.pow(linear_30, 3.0) + mul_31 = 0.044715 * pow_12 + pow_12 = None + add_29 = linear_30 + mul_31 + linear_30 = mul_31 = None + mul_32 = 0.7978845608028654 * add_29 + add_29 = None + tanh_2 = torch.tanh(mul_32) + mul_32 = None + add_30 = 1.0 + tanh_2 + tanh_2 = None + hidden_gelu_2 = mul_30 * add_30 + mul_30 = add_30 = None + hidden_linear_2 = torch._C._nn.linear( + forwarded_states_2, + l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_2 = l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_21 = hidden_gelu_2 * hidden_linear_2 + hidden_gelu_2 = hidden_linear_2 = None + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + hidden_states_23 = torch._C._nn.linear( + hidden_states_22, + l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_22 = l_self_modules_block_modules_2_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_18 = torch.nn.functional.dropout(hidden_states_23, 0.1, False, False) + hidden_states_23 = None + hidden_states_24 = layer_output_2 + dropout_18 + layer_output_2 = dropout_18 = None + to_12 = hidden_states_24.to(torch.float32) + pow_13 = to_12.pow(2) + to_12 = None + variance_9 = pow_13.mean(-1, keepdim=True) + pow_13 = None + add_32 = variance_9 + 1e-06 + variance_9 = None + rsqrt_9 = torch.rsqrt(add_32) + add_32 = None + hidden_states_25 = hidden_states_24 * rsqrt_9 + rsqrt_9 = None + normed_hidden_states_6 = ( + l_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_25 + ) + l_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_25 + ) = None + query_states_12 = torch._C._nn.linear( + normed_hidden_states_6, + l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + 
l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_24 = query_states_12.view(1, -1, 6, 64) + query_states_12 = None + query_states_13 = view_24.transpose(1, 2) + view_24 = None + key_states_12 = torch._C._nn.linear( + normed_hidden_states_6, + l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_12 = torch._C._nn.linear( + normed_hidden_states_6, + l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_6 = l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_25 = key_states_12.view(1, -1, 6, 64) + key_states_12 = None + key_states_13 = view_25.transpose(1, 2) + view_25 = None + view_26 = value_states_12.view(1, -1, 6, 64) + value_states_12 = None + value_states_13 = view_26.transpose(1, 2) + view_26 = None + transpose_33 = key_states_13.transpose(3, 2) + scores_12 = torch.matmul(query_states_13, transpose_33) + query_states_13 = transpose_33 = None + scores_12 += position_bias_1 + scores_13 = scores_12 + scores_12 = None + float_8 = scores_13.float() + softmax_6 = torch.nn.functional.softmax(float_8, dim=-1) + float_8 = None + attn_weights_12 = softmax_6.type_as(scores_13) + softmax_6 = scores_13 = None + attn_weights_13 = torch.nn.functional.dropout( + attn_weights_12, p=0.1, training=False + ) + attn_weights_12 = None + attn_output_24 = torch.matmul(attn_weights_13, value_states_13) + attn_weights_13 = None + transpose_34 = attn_output_24.transpose(1, 2) + attn_output_24 = None + attn_output_25 = transpose_34.contiguous() + transpose_34 = None + attn_output_26 = attn_output_25.view(1, -1, 384) + attn_output_25 = None + attn_output_27 = torch._C._nn.linear( + attn_output_26, + l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_26 = l_self_modules_block_modules_3_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_20 = torch.nn.functional.dropout(attn_output_27, 0.1, False, False) + attn_output_27 = None + hidden_states_26 = hidden_states_24 + dropout_20 + hidden_states_24 = dropout_20 = None + getitem_11 = cache_position[-1] + add_34 = getitem_11 + 1 + getitem_11 = add_34 = None + to_13 = hidden_states_26.to(torch.float32) + pow_14 = to_13.pow(2) + to_13 = None + variance_10 = pow_14.mean(-1, keepdim=True) + pow_14 = None + add_35 = variance_10 + 1e-06 + variance_10 = None + rsqrt_10 = torch.rsqrt(add_35) + add_35 = None + hidden_states_27 = hidden_states_26 * rsqrt_10 + rsqrt_10 = None + normed_hidden_states_7 = ( + l_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_27 + ) + l_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_27 + ) = None + query_states_14 = torch._C._nn.linear( + normed_hidden_states_7, + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_7 = l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_28 = query_states_14.view(1, -1, 
6, 64) + query_states_14 = None + query_states_15 = view_28.transpose(1, 2) + view_28 = None + key_states_14 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_14 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_29 = key_states_14.view(1, -1, 6, 64) + key_states_14 = None + key_states_15 = view_29.transpose(1, 2) + view_29 = None + view_30 = value_states_14.view(1, -1, 6, 64) + value_states_14 = None + value_states_15 = view_30.transpose(1, 2) + view_30 = None + transpose_38 = key_states_15.transpose(3, 2) + scores_14 = torch.matmul(query_states_15, transpose_38) + query_states_15 = transpose_38 = None + scores_14 += position_bias_3 + scores_15 = scores_14 + scores_14 = None + float_9 = scores_15.float() + softmax_7 = torch.nn.functional.softmax(float_9, dim=-1) + float_9 = None + attn_weights_14 = softmax_7.type_as(scores_15) + softmax_7 = scores_15 = None + attn_weights_15 = torch.nn.functional.dropout( + attn_weights_14, p=0.1, training=False + ) + attn_weights_14 = None + attn_output_28 = torch.matmul(attn_weights_15, value_states_15) + attn_weights_15 = None + transpose_39 = attn_output_28.transpose(1, 2) + attn_output_28 = None + attn_output_29 = transpose_39.contiguous() + transpose_39 = None + attn_output_30 = attn_output_29.view(1, -1, 384) + attn_output_29 = None + attn_output_31 = torch._C._nn.linear( + attn_output_30, + l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_30 = l_self_modules_block_modules_3_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_22 = torch.nn.functional.dropout(attn_output_31, 0.1, False, False) + attn_output_31 = None + layer_output_3 = hidden_states_26 + dropout_22 + hidden_states_26 = dropout_22 = None + to_14 = layer_output_3.to(torch.float32) + pow_15 = to_14.pow(2) + to_14 = None + variance_11 = pow_15.mean(-1, keepdim=True) + pow_15 = None + add_37 = variance_11 + 1e-06 + variance_11 = None + rsqrt_11 = torch.rsqrt(add_37) + add_37 = None + hidden_states_28 = layer_output_3 * rsqrt_11 + rsqrt_11 = None + forwarded_states_3 = ( + l_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_28 + ) + l_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_28 + ) = None + linear_41 = torch._C._nn.linear( + forwarded_states_3, + l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_41 = 0.5 * linear_41 + pow_16 = torch.pow(linear_41, 3.0) + mul_42 = 0.044715 * pow_16 + pow_16 = None + add_38 = linear_41 + mul_42 + linear_41 = mul_42 = None + mul_43 = 0.7978845608028654 * add_38 + add_38 = None + tanh_3 = torch.tanh(mul_43) + mul_43 = None + add_39 = 1.0 + tanh_3 + tanh_3 = None + hidden_gelu_3 
= mul_41 * add_39 + mul_41 = add_39 = None + hidden_linear_3 = torch._C._nn.linear( + forwarded_states_3, + l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_3 = l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_29 = hidden_gelu_3 * hidden_linear_3 + hidden_gelu_3 = hidden_linear_3 = None + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + hidden_states_31 = torch._C._nn.linear( + hidden_states_30, + l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_30 = l_self_modules_block_modules_3_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_24 = torch.nn.functional.dropout(hidden_states_31, 0.1, False, False) + hidden_states_31 = None + hidden_states_32 = layer_output_3 + dropout_24 + layer_output_3 = dropout_24 = None + to_15 = hidden_states_32.to(torch.float32) + pow_17 = to_15.pow(2) + to_15 = None + variance_12 = pow_17.mean(-1, keepdim=True) + pow_17 = None + add_41 = variance_12 + 1e-06 + variance_12 = None + rsqrt_12 = torch.rsqrt(add_41) + add_41 = None + hidden_states_33 = hidden_states_32 * rsqrt_12 + rsqrt_12 = None + normed_hidden_states_8 = ( + l_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_33 + ) + l_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_33 + ) = None + query_states_16 = torch._C._nn.linear( + normed_hidden_states_8, + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_32 = query_states_16.view(1, -1, 6, 64) + query_states_16 = None + query_states_17 = view_32.transpose(1, 2) + view_32 = None + key_states_16 = torch._C._nn.linear( + normed_hidden_states_8, + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_16 = torch._C._nn.linear( + normed_hidden_states_8, + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_8 = l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_33 = key_states_16.view(1, -1, 6, 64) + key_states_16 = None + key_states_17 = view_33.transpose(1, 2) + view_33 = None + view_34 = value_states_16.view(1, -1, 6, 64) + value_states_16 = None + value_states_17 = view_34.transpose(1, 2) + view_34 = None + transpose_43 = key_states_17.transpose(3, 2) + scores_16 = torch.matmul(query_states_17, transpose_43) + query_states_17 = transpose_43 = None + scores_16 += position_bias_1 + scores_17 = scores_16 + scores_16 = None + float_10 = scores_17.float() + softmax_8 = torch.nn.functional.softmax(float_10, dim=-1) + float_10 = None + attn_weights_16 = softmax_8.type_as(scores_17) + softmax_8 = scores_17 = None + attn_weights_17 = torch.nn.functional.dropout( + attn_weights_16, p=0.1, training=False + ) + 
attn_weights_16 = None + attn_output_32 = torch.matmul(attn_weights_17, value_states_17) + attn_weights_17 = None + transpose_44 = attn_output_32.transpose(1, 2) + attn_output_32 = None + attn_output_33 = transpose_44.contiguous() + transpose_44 = None + attn_output_34 = attn_output_33.view(1, -1, 384) + attn_output_33 = None + attn_output_35 = torch._C._nn.linear( + attn_output_34, + l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_34 = l_self_modules_block_modules_4_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_26 = torch.nn.functional.dropout(attn_output_35, 0.1, False, False) + attn_output_35 = None + hidden_states_34 = hidden_states_32 + dropout_26 + hidden_states_32 = dropout_26 = None + getitem_12 = cache_position[-1] + add_43 = getitem_12 + 1 + getitem_12 = add_43 = None + to_16 = hidden_states_34.to(torch.float32) + pow_18 = to_16.pow(2) + to_16 = None + variance_13 = pow_18.mean(-1, keepdim=True) + pow_18 = None + add_44 = variance_13 + 1e-06 + variance_13 = None + rsqrt_13 = torch.rsqrt(add_44) + add_44 = None + hidden_states_35 = hidden_states_34 * rsqrt_13 + rsqrt_13 = None + normed_hidden_states_9 = ( + l_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_35 + ) + l_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_35 + ) = None + query_states_18 = torch._C._nn.linear( + normed_hidden_states_9, + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_9 = l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_36 = query_states_18.view(1, -1, 6, 64) + query_states_18 = None + query_states_19 = view_36.transpose(1, 2) + view_36 = None + key_states_18 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_18 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_37 = key_states_18.view(1, -1, 6, 64) + key_states_18 = None + key_states_19 = view_37.transpose(1, 2) + view_37 = None + view_38 = value_states_18.view(1, -1, 6, 64) + value_states_18 = None + value_states_19 = view_38.transpose(1, 2) + view_38 = None + transpose_48 = key_states_19.transpose(3, 2) + scores_18 = torch.matmul(query_states_19, transpose_48) + query_states_19 = transpose_48 = None + scores_18 += position_bias_3 + scores_19 = scores_18 + scores_18 = None + float_11 = scores_19.float() + softmax_9 = torch.nn.functional.softmax(float_11, dim=-1) + float_11 = None + attn_weights_18 = softmax_9.type_as(scores_19) + softmax_9 = scores_19 = None + attn_weights_19 = torch.nn.functional.dropout( + attn_weights_18, p=0.1, training=False + ) + attn_weights_18 = None + attn_output_36 = torch.matmul(attn_weights_19, value_states_19) + attn_weights_19 = None + transpose_49 = attn_output_36.transpose(1, 2) + 
attn_output_36 = None + attn_output_37 = transpose_49.contiguous() + transpose_49 = None + attn_output_38 = attn_output_37.view(1, -1, 384) + attn_output_37 = None + attn_output_39 = torch._C._nn.linear( + attn_output_38, + l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_38 = l_self_modules_block_modules_4_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_28 = torch.nn.functional.dropout(attn_output_39, 0.1, False, False) + attn_output_39 = None + layer_output_4 = hidden_states_34 + dropout_28 + hidden_states_34 = dropout_28 = None + to_17 = layer_output_4.to(torch.float32) + pow_19 = to_17.pow(2) + to_17 = None + variance_14 = pow_19.mean(-1, keepdim=True) + pow_19 = None + add_46 = variance_14 + 1e-06 + variance_14 = None + rsqrt_14 = torch.rsqrt(add_46) + add_46 = None + hidden_states_36 = layer_output_4 * rsqrt_14 + rsqrt_14 = None + forwarded_states_4 = ( + l_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_36 + ) + l_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_36 + ) = None + linear_52 = torch._C._nn.linear( + forwarded_states_4, + l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_52 = 0.5 * linear_52 + pow_20 = torch.pow(linear_52, 3.0) + mul_53 = 0.044715 * pow_20 + pow_20 = None + add_47 = linear_52 + mul_53 + linear_52 = mul_53 = None + mul_54 = 0.7978845608028654 * add_47 + add_47 = None + tanh_4 = torch.tanh(mul_54) + mul_54 = None + add_48 = 1.0 + tanh_4 + tanh_4 = None + hidden_gelu_4 = mul_52 * add_48 + mul_52 = add_48 = None + hidden_linear_4 = torch._C._nn.linear( + forwarded_states_4, + l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_4 = l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_37 = hidden_gelu_4 * hidden_linear_4 + hidden_gelu_4 = hidden_linear_4 = None + hidden_states_38 = torch.nn.functional.dropout( + hidden_states_37, 0.1, False, False + ) + hidden_states_37 = None + hidden_states_39 = torch._C._nn.linear( + hidden_states_38, + l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_38 = l_self_modules_block_modules_4_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_30 = torch.nn.functional.dropout(hidden_states_39, 0.1, False, False) + hidden_states_39 = None + hidden_states_40 = layer_output_4 + dropout_30 + layer_output_4 = dropout_30 = None + to_18 = hidden_states_40.to(torch.float32) + pow_21 = to_18.pow(2) + to_18 = None + variance_15 = pow_21.mean(-1, keepdim=True) + pow_21 = None + add_50 = variance_15 + 1e-06 + variance_15 = None + rsqrt_15 = torch.rsqrt(add_50) + add_50 = None + hidden_states_41 = hidden_states_40 * rsqrt_15 + rsqrt_15 = None + normed_hidden_states_10 = ( + l_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_41 + ) + 
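# [editor's sketch] The preceding statements are T5-style RMSNorm traced into
# primitive ops: cast to float32, mean of squares over the last dim, rsqrt with
# eps = 1e-6, rescale, then multiply by the layer_norm weight. Unlike standard
# LayerNorm there is no mean subtraction and no bias. Equivalent, under those
# assumptions:
import torch

def t5_rms_norm(hidden_states: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
    return weight * (hidden_states * torch.rsqrt(variance + eps))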
l_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_41 + ) = None + query_states_20 = torch._C._nn.linear( + normed_hidden_states_10, + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_40 = query_states_20.view(1, -1, 6, 64) + query_states_20 = None + query_states_21 = view_40.transpose(1, 2) + view_40 = None + key_states_20 = torch._C._nn.linear( + normed_hidden_states_10, + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_20 = torch._C._nn.linear( + normed_hidden_states_10, + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_10 = l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_41 = key_states_20.view(1, -1, 6, 64) + key_states_20 = None + key_states_21 = view_41.transpose(1, 2) + view_41 = None + view_42 = value_states_20.view(1, -1, 6, 64) + value_states_20 = None + value_states_21 = view_42.transpose(1, 2) + view_42 = None + transpose_53 = key_states_21.transpose(3, 2) + scores_20 = torch.matmul(query_states_21, transpose_53) + query_states_21 = transpose_53 = None + scores_20 += position_bias_1 + scores_21 = scores_20 + scores_20 = None + float_12 = scores_21.float() + softmax_10 = torch.nn.functional.softmax(float_12, dim=-1) + float_12 = None + attn_weights_20 = softmax_10.type_as(scores_21) + softmax_10 = scores_21 = None + attn_weights_21 = torch.nn.functional.dropout( + attn_weights_20, p=0.1, training=False + ) + attn_weights_20 = None + attn_output_40 = torch.matmul(attn_weights_21, value_states_21) + attn_weights_21 = None + transpose_54 = attn_output_40.transpose(1, 2) + attn_output_40 = None + attn_output_41 = transpose_54.contiguous() + transpose_54 = None + attn_output_42 = attn_output_41.view(1, -1, 384) + attn_output_41 = None + attn_output_43 = torch._C._nn.linear( + attn_output_42, + l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_42 = l_self_modules_block_modules_5_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_32 = torch.nn.functional.dropout(attn_output_43, 0.1, False, False) + attn_output_43 = None + hidden_states_42 = hidden_states_40 + dropout_32 + hidden_states_40 = dropout_32 = None + getitem_13 = cache_position[-1] + add_52 = getitem_13 + 1 + getitem_13 = add_52 = None + to_19 = hidden_states_42.to(torch.float32) + pow_22 = to_19.pow(2) + to_19 = None + variance_16 = pow_22.mean(-1, keepdim=True) + pow_22 = None + add_53 = variance_16 + 1e-06 + variance_16 = None + rsqrt_16 = torch.rsqrt(add_53) + add_53 = None + hidden_states_43 = hidden_states_42 * rsqrt_16 + rsqrt_16 = None + normed_hidden_states_11 = ( + l_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_43 + ) + l_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_43 + ) = None + query_states_22 = torch._C._nn.linear( + 
normed_hidden_states_11, + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_11 = l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_44 = query_states_22.view(1, -1, 6, 64) + query_states_22 = None + query_states_23 = view_44.transpose(1, 2) + view_44 = None + key_states_22 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_22 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_45 = key_states_22.view(1, -1, 6, 64) + key_states_22 = None + key_states_23 = view_45.transpose(1, 2) + view_45 = None + view_46 = value_states_22.view(1, -1, 6, 64) + value_states_22 = None + value_states_23 = view_46.transpose(1, 2) + view_46 = None + transpose_58 = key_states_23.transpose(3, 2) + scores_22 = torch.matmul(query_states_23, transpose_58) + query_states_23 = transpose_58 = None + scores_22 += position_bias_3 + scores_23 = scores_22 + scores_22 = None + float_13 = scores_23.float() + softmax_11 = torch.nn.functional.softmax(float_13, dim=-1) + float_13 = None + attn_weights_22 = softmax_11.type_as(scores_23) + softmax_11 = scores_23 = None + attn_weights_23 = torch.nn.functional.dropout( + attn_weights_22, p=0.1, training=False + ) + attn_weights_22 = None + attn_output_44 = torch.matmul(attn_weights_23, value_states_23) + attn_weights_23 = None + transpose_59 = attn_output_44.transpose(1, 2) + attn_output_44 = None + attn_output_45 = transpose_59.contiguous() + transpose_59 = None + attn_output_46 = attn_output_45.view(1, -1, 384) + attn_output_45 = None + attn_output_47 = torch._C._nn.linear( + attn_output_46, + l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_46 = l_self_modules_block_modules_5_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_34 = torch.nn.functional.dropout(attn_output_47, 0.1, False, False) + attn_output_47 = None + layer_output_5 = hidden_states_42 + dropout_34 + hidden_states_42 = dropout_34 = None + to_20 = layer_output_5.to(torch.float32) + pow_23 = to_20.pow(2) + to_20 = None + variance_17 = pow_23.mean(-1, keepdim=True) + pow_23 = None + add_55 = variance_17 + 1e-06 + variance_17 = None + rsqrt_17 = torch.rsqrt(add_55) + add_55 = None + hidden_states_44 = layer_output_5 * rsqrt_17 + rsqrt_17 = None + forwarded_states_5 = ( + l_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_44 + ) + l_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_44 + ) = None + linear_63 = torch._C._nn.linear( + forwarded_states_5, + l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + 
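# [editor's sketch] The statements that follow spell out T5's gated-GELU (GEGLU)
# feed-forward with the tanh approximation of GELU written as primitive ops:
# 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3))) on the wi_0 branch,
# an elementwise gate with the wi_1 branch, then the wo projection. The constant
# 0.7978845608028654 in the trace is sqrt(2/pi); dropout is a no-op here
# (training=False). A compact equivalent with placeholder weight tensors:
import math
import torch

def t5_gated_gelu_ffn(x: torch.Tensor, wi_0: torch.Tensor, wi_1: torch.Tensor, wo: torch.Tensor) -> torch.Tensor:
    h = torch.nn.functional.linear(x, wi_0)
    gelu = 0.5 * h * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (h + 0.044715 * h.pow(3.0))))
    return torch.nn.functional.linear(gelu * torch.nn.functional.linear(x, wi_1), wo)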
l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_63 = 0.5 * linear_63 + pow_24 = torch.pow(linear_63, 3.0) + mul_64 = 0.044715 * pow_24 + pow_24 = None + add_56 = linear_63 + mul_64 + linear_63 = mul_64 = None + mul_65 = 0.7978845608028654 * add_56 + add_56 = None + tanh_5 = torch.tanh(mul_65) + mul_65 = None + add_57 = 1.0 + tanh_5 + tanh_5 = None + hidden_gelu_5 = mul_63 * add_57 + mul_63 = add_57 = None + hidden_linear_5 = torch._C._nn.linear( + forwarded_states_5, + l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_5 = l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_45 = hidden_gelu_5 * hidden_linear_5 + hidden_gelu_5 = hidden_linear_5 = None + hidden_states_46 = torch.nn.functional.dropout( + hidden_states_45, 0.1, False, False + ) + hidden_states_45 = None + hidden_states_47 = torch._C._nn.linear( + hidden_states_46, + l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_46 = l_self_modules_block_modules_5_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_36 = torch.nn.functional.dropout(hidden_states_47, 0.1, False, False) + hidden_states_47 = None + hidden_states_48 = layer_output_5 + dropout_36 + layer_output_5 = dropout_36 = None + to_21 = hidden_states_48.to(torch.float32) + pow_25 = to_21.pow(2) + to_21 = None + variance_18 = pow_25.mean(-1, keepdim=True) + pow_25 = None + add_59 = variance_18 + 1e-06 + variance_18 = None + rsqrt_18 = torch.rsqrt(add_59) + add_59 = None + hidden_states_49 = hidden_states_48 * rsqrt_18 + rsqrt_18 = None + normed_hidden_states_12 = ( + l_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_49 + ) + l_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_49 + ) = None + query_states_24 = torch._C._nn.linear( + normed_hidden_states_12, + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_48 = query_states_24.view(1, -1, 6, 64) + query_states_24 = None + query_states_25 = view_48.transpose(1, 2) + view_48 = None + key_states_24 = torch._C._nn.linear( + normed_hidden_states_12, + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_24 = torch._C._nn.linear( + normed_hidden_states_12, + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_12 = l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_49 = key_states_24.view(1, -1, 6, 64) + key_states_24 = None + key_states_25 = view_49.transpose(1, 2) + view_49 = None + view_50 = value_states_24.view(1, -1, 6, 64) + value_states_24 = None + value_states_25 = view_50.transpose(1, 2) + view_50 = None + transpose_63 = key_states_25.transpose(3, 2) + 
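# [editor's sketch] The next few statements compute T5 attention weights:
# Q @ K^T with no 1/sqrt(head_dim) scaling (T5 folds the scale into its weight
# initialization), an additive relative position bias, softmax in float32 for
# numerical stability, and a cast back to the score dtype via type_as. A minimal
# equivalent under those assumptions:
import torch

def t5_attention_weights(query: torch.Tensor, key: torch.Tensor, position_bias: torch.Tensor) -> torch.Tensor:
    # query, key: (batch, n_heads, seq, head_dim); position_bias broadcasts over batch
    scores = torch.matmul(query, key.transpose(3, 2)) + position_bias
    return torch.nn.functional.softmax(scores.float(), dim=-1).type_as(scores)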
scores_24 = torch.matmul(query_states_25, transpose_63) + query_states_25 = transpose_63 = None + scores_24 += position_bias_1 + scores_25 = scores_24 + scores_24 = None + float_14 = scores_25.float() + softmax_12 = torch.nn.functional.softmax(float_14, dim=-1) + float_14 = None + attn_weights_24 = softmax_12.type_as(scores_25) + softmax_12 = scores_25 = None + attn_weights_25 = torch.nn.functional.dropout( + attn_weights_24, p=0.1, training=False + ) + attn_weights_24 = None + attn_output_48 = torch.matmul(attn_weights_25, value_states_25) + attn_weights_25 = None + transpose_64 = attn_output_48.transpose(1, 2) + attn_output_48 = None + attn_output_49 = transpose_64.contiguous() + transpose_64 = None + attn_output_50 = attn_output_49.view(1, -1, 384) + attn_output_49 = None + attn_output_51 = torch._C._nn.linear( + attn_output_50, + l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_50 = l_self_modules_block_modules_6_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_38 = torch.nn.functional.dropout(attn_output_51, 0.1, False, False) + attn_output_51 = None + hidden_states_50 = hidden_states_48 + dropout_38 + hidden_states_48 = dropout_38 = None + getitem_14 = cache_position[-1] + add_61 = getitem_14 + 1 + getitem_14 = add_61 = None + to_22 = hidden_states_50.to(torch.float32) + pow_26 = to_22.pow(2) + to_22 = None + variance_19 = pow_26.mean(-1, keepdim=True) + pow_26 = None + add_62 = variance_19 + 1e-06 + variance_19 = None + rsqrt_19 = torch.rsqrt(add_62) + add_62 = None + hidden_states_51 = hidden_states_50 * rsqrt_19 + rsqrt_19 = None + normed_hidden_states_13 = ( + l_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_51 + ) + l_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_51 + ) = None + query_states_26 = torch._C._nn.linear( + normed_hidden_states_13, + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_13 = l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_52 = query_states_26.view(1, -1, 6, 64) + query_states_26 = None + query_states_27 = view_52.transpose(1, 2) + view_52 = None + key_states_26 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_26 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = ( + None + ) + view_53 = key_states_26.view(1, -1, 6, 64) + key_states_26 = None + key_states_27 = view_53.transpose(1, 2) + view_53 = None + view_54 = value_states_26.view(1, -1, 6, 64) + value_states_26 = None + value_states_27 = view_54.transpose(1, 2) + view_54 = None + transpose_68 = key_states_27.transpose(3, 2) + scores_26 = torch.matmul(query_states_27, transpose_68) + query_states_27 = transpose_68 = None + scores_26 += position_bias_3 + scores_27 = scores_26 + 
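# [editor's sketch] In the cross-attention sub-layer traced above, queries come
# from the decoder stream while keys and values are projected from the encoder
# output (l_encoder_hidden_states_), each reshaped to 6 heads of 64 dims. (The
# nearby cache_position[-1] + 1 arithmetic is traced but immediately discarded;
# it is residue of cache bookkeeping in the original module.) A minimal sketch
# of the Q/K/V split, with placeholder weight names:
import torch

def cross_attention_qkv(decoder_states, encoder_states, wq, wk, wv, n_heads=6, head_dim=64):
    def split_heads(x):
        return x.view(x.shape[0], -1, n_heads, head_dim).transpose(1, 2)
    q = split_heads(torch.nn.functional.linear(decoder_states, wq))
    k = split_heads(torch.nn.functional.linear(encoder_states, wk))
    v = split_heads(torch.nn.functional.linear(encoder_states, wv))
    return q, k, v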
scores_26 = None + float_15 = scores_27.float() + softmax_13 = torch.nn.functional.softmax(float_15, dim=-1) + float_15 = None + attn_weights_26 = softmax_13.type_as(scores_27) + softmax_13 = scores_27 = None + attn_weights_27 = torch.nn.functional.dropout( + attn_weights_26, p=0.1, training=False + ) + attn_weights_26 = None + attn_output_52 = torch.matmul(attn_weights_27, value_states_27) + attn_weights_27 = None + transpose_69 = attn_output_52.transpose(1, 2) + attn_output_52 = None + attn_output_53 = transpose_69.contiguous() + transpose_69 = None + attn_output_54 = attn_output_53.view(1, -1, 384) + attn_output_53 = None + attn_output_55 = torch._C._nn.linear( + attn_output_54, + l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_54 = l_self_modules_block_modules_6_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_40 = torch.nn.functional.dropout(attn_output_55, 0.1, False, False) + attn_output_55 = None + layer_output_6 = hidden_states_50 + dropout_40 + hidden_states_50 = dropout_40 = None + to_23 = layer_output_6.to(torch.float32) + pow_27 = to_23.pow(2) + to_23 = None + variance_20 = pow_27.mean(-1, keepdim=True) + pow_27 = None + add_64 = variance_20 + 1e-06 + variance_20 = None + rsqrt_20 = torch.rsqrt(add_64) + add_64 = None + hidden_states_52 = layer_output_6 * rsqrt_20 + rsqrt_20 = None + forwarded_states_6 = ( + l_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_ + * hidden_states_52 + ) + l_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = ( + hidden_states_52 + ) = None + linear_74 = torch._C._nn.linear( + forwarded_states_6, + l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_, + None, + ) + l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = ( + None + ) + mul_74 = 0.5 * linear_74 + pow_28 = torch.pow(linear_74, 3.0) + mul_75 = 0.044715 * pow_28 + pow_28 = None + add_65 = linear_74 + mul_75 + linear_74 = mul_75 = None + mul_76 = 0.7978845608028654 * add_65 + add_65 = None + tanh_6 = torch.tanh(mul_76) + mul_76 = None + add_66 = 1.0 + tanh_6 + tanh_6 = None + hidden_gelu_6 = mul_74 * add_66 + mul_74 = add_66 = None + hidden_linear_6 = torch._C._nn.linear( + forwarded_states_6, + l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_, + None, + ) + forwarded_states_6 = l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None) + hidden_states_53 = hidden_gelu_6 * hidden_linear_6 + hidden_gelu_6 = hidden_linear_6 = None + hidden_states_54 = torch.nn.functional.dropout( + hidden_states_53, 0.1, False, False + ) + hidden_states_53 = None + hidden_states_55 = torch._C._nn.linear( + hidden_states_54, + l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_, + None, + ) + hidden_states_54 = l_self_modules_block_modules_6_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None) + dropout_42 = torch.nn.functional.dropout(hidden_states_55, 0.1, False, False) + hidden_states_55 = None + hidden_states_56 = layer_output_6 + dropout_42 + layer_output_6 = dropout_42 = None + to_24 = hidden_states_56.to(torch.float32) + pow_29 = to_24.pow(2) + to_24 = 
None + variance_21 = pow_29.mean(-1, keepdim=True) + pow_29 = None + add_68 = variance_21 + 1e-06 + variance_21 = None + rsqrt_21 = torch.rsqrt(add_68) + add_68 = None + hidden_states_57 = hidden_states_56 * rsqrt_21 + rsqrt_21 = None + normed_hidden_states_14 = ( + l_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_ + * hidden_states_57 + ) + l_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_ = ( + hidden_states_57 + ) = None + query_states_28 = torch._C._nn.linear( + normed_hidden_states_14, + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_, + None, + ) + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_q_parameters_weight_ = ( + None + ) + view_56 = query_states_28.view(1, -1, 6, 64) + query_states_28 = None + query_states_29 = view_56.transpose(1, 2) + view_56 = None + key_states_28 = torch._C._nn.linear( + normed_hidden_states_14, + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_28 = torch._C._nn.linear( + normed_hidden_states_14, + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_, + None, + ) + normed_hidden_states_14 = l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_v_parameters_weight_ = (None) + view_57 = key_states_28.view(1, -1, 6, 64) + key_states_28 = None + key_states_29 = view_57.transpose(1, 2) + view_57 = None + view_58 = value_states_28.view(1, -1, 6, 64) + value_states_28 = None + value_states_29 = view_58.transpose(1, 2) + view_58 = None + transpose_73 = key_states_29.transpose(3, 2) + scores_28 = torch.matmul(query_states_29, transpose_73) + query_states_29 = transpose_73 = None + scores_28 += position_bias_1 + scores_29 = scores_28 + scores_28 = position_bias_1 = None + float_16 = scores_29.float() + softmax_14 = torch.nn.functional.softmax(float_16, dim=-1) + float_16 = None + attn_weights_28 = softmax_14.type_as(scores_29) + softmax_14 = scores_29 = None + attn_weights_29 = torch.nn.functional.dropout( + attn_weights_28, p=0.1, training=False + ) + attn_weights_28 = None + attn_output_56 = torch.matmul(attn_weights_29, value_states_29) + attn_weights_29 = None + transpose_74 = attn_output_56.transpose(1, 2) + attn_output_56 = None + attn_output_57 = transpose_74.contiguous() + transpose_74 = None + attn_output_58 = attn_output_57.view(1, -1, 384) + attn_output_57 = None + attn_output_59 = torch._C._nn.linear( + attn_output_58, + l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_, + None, + ) + attn_output_58 = l_self_modules_block_modules_7_modules_layer_modules_0_modules_self_attention_modules_o_parameters_weight_ = (None) + dropout_44 = torch.nn.functional.dropout(attn_output_59, 0.1, False, False) + attn_output_59 = None + hidden_states_58 = hidden_states_56 + dropout_44 + hidden_states_56 = dropout_44 = None + getitem_15 = cache_position[-1] + cache_position = None + add_70 = getitem_15 + 1 + getitem_15 = add_70 = None + to_25 = hidden_states_58.to(torch.float32) + pow_30 = to_25.pow(2) + to_25 = None + variance_22 = pow_30.mean(-1, keepdim=True) + pow_30 = None + add_71 = variance_22 + 1e-06 + variance_22 = None + rsqrt_22 = 
torch.rsqrt(add_71) + add_71 = None + hidden_states_59 = hidden_states_58 * rsqrt_22 + rsqrt_22 = None + normed_hidden_states_15 = ( + l_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_ + * hidden_states_59 + ) + l_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_ = ( + hidden_states_59 + ) = None + query_states_30 = torch._C._nn.linear( + normed_hidden_states_15, + l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_, + None, + ) + normed_hidden_states_15 = l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_q_parameters_weight_ = (None) + view_60 = query_states_30.view(1, -1, 6, 64) + query_states_30 = None + query_states_31 = view_60.transpose(1, 2) + view_60 = None + key_states_30 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_, + None, + ) + l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_k_parameters_weight_ = ( + None + ) + value_states_30 = torch._C._nn.linear( + l_encoder_hidden_states_, + l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_, + None, + ) + l_encoder_hidden_states_ = l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_v_parameters_weight_ = (None) + view_61 = key_states_30.view(1, -1, 6, 64) + key_states_30 = None + key_states_31 = view_61.transpose(1, 2) + view_61 = None + view_62 = value_states_30.view(1, -1, 6, 64) + value_states_30 = None + value_states_31 = view_62.transpose(1, 2) + view_62 = None + transpose_78 = key_states_31.transpose(3, 2) + scores_30 = torch.matmul(query_states_31, transpose_78) + query_states_31 = transpose_78 = None + scores_30 += position_bias_3 + scores_31 = scores_30 + scores_30 = position_bias_3 = None + float_17 = scores_31.float() + softmax_15 = torch.nn.functional.softmax(float_17, dim=-1) + float_17 = None + attn_weights_30 = softmax_15.type_as(scores_31) + softmax_15 = scores_31 = None + attn_weights_31 = torch.nn.functional.dropout( + attn_weights_30, p=0.1, training=False + ) + attn_weights_30 = None + attn_output_60 = torch.matmul(attn_weights_31, value_states_31) + attn_weights_31 = None + transpose_79 = attn_output_60.transpose(1, 2) + attn_output_60 = None + attn_output_61 = transpose_79.contiguous() + transpose_79 = None + attn_output_62 = attn_output_61.view(1, -1, 384) + attn_output_61 = None + attn_output_63 = torch._C._nn.linear( + attn_output_62, + l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_, + None, + ) + attn_output_62 = l_self_modules_block_modules_7_modules_layer_modules_1_modules_enc_dec_attention_modules_o_parameters_weight_ = (None) + dropout_46 = torch.nn.functional.dropout(attn_output_63, 0.1, False, False) + attn_output_63 = None + layer_output_7 = hidden_states_58 + dropout_46 + hidden_states_58 = dropout_46 = None + to_26 = layer_output_7.to(torch.float32) + pow_31 = to_26.pow(2) + to_26 = None + variance_23 = pow_31.mean(-1, keepdim=True) + pow_31 = None + add_73 = variance_23 + 1e-06 + variance_23 = None + rsqrt_23 = torch.rsqrt(add_73) + add_73 = None + hidden_states_60 = layer_output_7 * rsqrt_23 + rsqrt_23 = None + forwarded_states_7 = ( + 
l_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_
+        * hidden_states_60
+    )
+    l_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_ = (
+        hidden_states_60
+    ) = None
+    linear_85 = torch._C._nn.linear(
+        forwarded_states_7,
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_,
+        None,
+    )
+    l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_0_parameters_weight_ = (
+        None
+    )
+    mul_85 = 0.5 * linear_85
+    pow_32 = torch.pow(linear_85, 3.0)
+    mul_86 = 0.044715 * pow_32
+    pow_32 = None
+    add_74 = linear_85 + mul_86
+    linear_85 = mul_86 = None
+    mul_87 = 0.7978845608028654 * add_74
+    add_74 = None
+    tanh_7 = torch.tanh(mul_87)
+    mul_87 = None
+    add_75 = 1.0 + tanh_7
+    tanh_7 = None
+    hidden_gelu_7 = mul_85 * add_75
+    mul_85 = add_75 = None
+    hidden_linear_7 = torch._C._nn.linear(
+        forwarded_states_7,
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_,
+        None,
+    )
+    forwarded_states_7 = l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wi_1_parameters_weight_ = (None)
+    hidden_states_61 = hidden_gelu_7 * hidden_linear_7
+    hidden_gelu_7 = hidden_linear_7 = None
+    hidden_states_62 = torch.nn.functional.dropout(
+        hidden_states_61, 0.1, False, False
+    )
+    hidden_states_61 = None
+    hidden_states_63 = torch._C._nn.linear(
+        hidden_states_62,
+        l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_,
+        None,
+    )
+    hidden_states_62 = l_self_modules_block_modules_7_modules_layer_modules_2_modules_dense_relu_dense_modules_wo_parameters_weight_ = (None)
+    dropout_48 = torch.nn.functional.dropout(hidden_states_63, 0.1, False, False)
+    hidden_states_63 = None
+    hidden_states_64 = layer_output_7 + dropout_48
+    layer_output_7 = dropout_48 = None
+    to_27 = hidden_states_64.to(torch.float32)
+    pow_33 = to_27.pow(2)
+    to_27 = None
+    variance_24 = pow_33.mean(-1, keepdim=True)
+    pow_33 = None
+    add_77 = variance_24 + 1e-06
+    variance_24 = None
+    rsqrt_24 = torch.rsqrt(add_77)
+    add_77 = None
+    hidden_states_65 = hidden_states_64 * rsqrt_24
+    hidden_states_64 = rsqrt_24 = None
+    hidden_states_66 = (
+        l_self_modules_final_layer_norm_parameters_weight_ * hidden_states_65
+    )
+    l_self_modules_final_layer_norm_parameters_weight_ = hidden_states_65 = None
+    hidden_states_67 = torch.nn.functional.dropout(
+        hidden_states_66, 0.1, False, False
+    )
+    hidden_states_66 = None
+    return (
+        value_states_1,
+        key_states_1,
+        value_states_3,
+        key_states_3,
+        value_states_5,
+        key_states_5,
+        value_states_7,
+        key_states_7,
+        value_states_9,
+        key_states_9,
+        value_states_11,
+        key_states_11,
+        value_states_13,
+        key_states_13,
+        value_states_15,
+        key_states_15,
+        value_states_17,
+        key_states_17,
+        value_states_19,
+        key_states_19,
+        value_states_21,
+        key_states_21,
+        value_states_23,
+        key_states_23,
+        value_states_25,
+        key_states_25,
+        value_states_27,
+        key_states_27,
+        value_states_29,
+        key_states_29,
+        value_states_31,
+        key_states_31,
+        hidden_states_67,
+    )
diff --git a/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/weight_meta.py b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/weight_meta.py
new file mode 100644
index 000000000..52181dea9
--- /dev/null
+++ b/samples/transformers-auto-model/moska_plt5-seq-clf-with-entities-updated-finetuned/weight_meta.py
@@ -0,0 +1,1168 @@
+class Program_weight_tensor_meta_L_inputs_embeds_:
+    name = "L_inputs_embeds_"
+    shape = [1, 22, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.078
+    std = 1.019
+    data = None
+
+
+class Program_weight_tensor_meta_L_encoder_hidden_states_:
+    name = "L_encoder_hidden_states_"
+    shape = [1, 22, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.002
+    std = 1.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_encoder_attention_mask_:
+    name = "L_encoder_attention_mask_"
+    shape = [1, 22]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_layer_norm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_"
+    shape = [384, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.006
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_"
+    shape = [384, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.044
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_"
+    shape = [384, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.044
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_relative_attention_bias_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_relative_attention_bias_parameters_weight_"
+    shape = [32, 6]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.003
+    std = 0.045
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_"
+    shape = [512, 384]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.051
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_:
+    name = "L_self_modules_block_modules_0_modules_layer_modules_1_modules_layer_norm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_:
+    name =
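# [editor's sketch] Each Program_weight_tensor_meta_* class in this file records
# a tensor's shape, dtype, device, and summary statistics (mean/std), or literal
# values in `data` for small integer tensors such as the attention mask. A sketch
# of how such metadata could be materialized into stand-in tensors when replaying
# the graph (this helper is illustrative; the sample files do not ship it):
import torch

def materialize(meta) -> torch.Tensor:
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    if meta.data is not None:
        return torch.tensor(meta.data, dtype=dtype, device=meta.device).reshape(meta.shape)
    # draw from N(mean, std) as a stand-in for the real parameter values
    return torch.empty(meta.shape, dtype=dtype, device=meta.device).normal_(meta.mean, meta.std)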
"L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_0_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = 
"L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_1_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_2_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = 
"L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_3_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = 
"L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_4_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = 
"L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_5_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_6_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_0_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_: + name = 
"L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_0_modules_SelfAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_1_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_q_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.006 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_k_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_v_parameters_weight_" + shape = [384, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_1_modules_EncDecAttention_modules_o_parameters_weight_" + shape = [512, 384] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.051 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_2_modules_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_0_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wi_1_parameters_weight_" + shape = [1024, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.044 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_: + name = "L_self_modules_block_modules_7_modules_layer_modules_2_modules_DenseReluDense_modules_wo_parameters_weight_" + shape = [512, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.031 + data = None + + +class Program_weight_tensor_meta_L_self_modules_final_layer_norm_parameters_weight_: + name = "L_self_modules_final_layer_norm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_hash.txt b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_hash.txt new file mode 100644 index 000000000..4dd8182bb --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_hash.txt @@ -0,0 +1 @@ +25d70ec1e9e0323f3f9a5c94ae84725ef25d25a72e70b8b07c38d3ceb5a4fee7 \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_net.json b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_meta.py b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_tensor_constraints.py b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/model.py b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/model.py new file mode 100644 index 000000000..32ce633a1 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/model.py @@ -0,0 +1,1112 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
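+        # The placeholder names above appear to come from a torch.compile /
+        # Dynamo capture, which flattens every parameter and buffer of the
+        # traced model into one positional input and encodes the attribute
+        # path in the name (e.g. encoder.layer.1.attention.self.key.bias).
+        # Each of the 8 encoder layers contributes 16 tensors: query/key/value
+        # and attention-output dense weights and biases, the two LayerNorms,
+        # and the intermediate and output dense projections. The remaining
+        # layers below repeat this block verbatim.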
L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_token_type_ids_ = L_token_type_ids_ + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_attention_mask_ = L_attention_mask_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + 
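+        # This prologue only rebinds the flat `L_*` placeholders to `l_*`
+        # locals, one assignment per captured tensor; no computation happens
+        # until every alias is in place. Later in the body each alias is set
+        # to None as soon as its value has been consumed, which looks like a
+        # deliberate early release of tensor references.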
l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ + 
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
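# Editor's note: the block of aliasing assignments above is characteristic of
# TorchDynamo graph capture: every module parameter is lifted into a graph
# input whose name encodes its attribute path (e.g.
# self.encoder.layer[5].output.LayerNorm.weight becomes
# L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_).
# A minimal sketch of where such names come from, using a toy module that is
# not part of this patch:
import torch

class Toy(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = torch.nn.Linear(4, 4)

    def forward(self, x):
        return self.dense(x)

# Compiling and running once captures a graph in which self.dense.weight and
# self.dense.bias appear as lifted inputs with Dynamo-generated names.
compiled = torch.compile(Toy())
print(compiled(torch.randn(2, 4)).shape)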
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_pooler_modules_dense_parameters_weight_ = (
+            L_self_modules_pooler_modules_dense_parameters_weight_
+        )
+        l_self_modules_pooler_modules_dense_parameters_bias_ = (
+            L_self_modules_pooler_modules_dense_parameters_bias_
+        )
+        position_ids = l_self_modules_embeddings_buffers_position_ids_[
+            (slice(None, None, None), slice(0, 11, None))
+        ]
+        l_self_modules_embeddings_buffers_position_ids_ = None
+        inputs_embeds = torch.nn.functional.embedding(
+            l_input_ids_,
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+            0,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_input_ids_ = (
+            l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        ) = None
+        token_type_embeddings = torch.nn.functional.embedding(
+            l_token_type_ids_,
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        l_token_type_ids_ = (
+            l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        ) = None
+        embeddings = inputs_embeds + token_type_embeddings
+        inputs_embeds = token_type_embeddings = None
+        position_embeddings = torch.nn.functional.embedding(
+            position_ids,
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+            None,
+            None,
+            2.0,
+            False,
+            False,
+        )
+        position_ids = (
+            l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        ) = None
+        embeddings += position_embeddings
+        embeddings_1 = embeddings
+        embeddings = position_embeddings = None
+        embeddings_2 = torch.nn.functional.layer_norm(
+            embeddings_1,
+            (512,),
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+            l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        embeddings_1 = (
+            l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+        ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+        embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False)
+        embeddings_2 = None
+        getitem_1 = l_attention_mask_[
+            (slice(None, None, None), None, None, slice(None, None, None))
+        ]
+        l_attention_mask_ = None
+        expand = getitem_1.expand(1, 1, 11, 11)
+        getitem_1 = None
+        expanded_mask = expand.to(torch.float32)
+        expand = None
+        tensor = torch.tensor(1.0, dtype=torch.float32)
+        inverted_mask = tensor - expanded_mask
+        tensor = expanded_mask = None
+        to_1 = inverted_mask.to(torch.bool)
+        extended_attention_mask = inverted_mask.masked_fill(
+            to_1, -3.4028234663852886e38
+        )
+        inverted_mask = to_1 = None
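# Editor's note: the indexing, expand, and masked_fill ops above implement the
# usual Hugging Face "extended attention mask": a [batch, seq] 0/1 mask is
# broadcast to [batch, 1, seq, seq] and converted to an additive float mask
# whose masked positions hold the most negative float32 value
# (-3.4028234663852886e38 == torch.finfo(torch.float32).min). An illustrative
# re-implementation, not code from this patch:
import torch

def extended_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    seq_len = attention_mask.shape[-1]
    # [batch, seq] -> [batch, 1, 1, seq] -> [batch, 1, seq, seq]
    mask = attention_mask[:, None, None, :].expand(-1, 1, seq_len, seq_len)
    inverted = 1.0 - mask.to(torch.float32)
    return inverted.masked_fill(inverted.to(torch.bool), torch.finfo(torch.float32).min)

print(extended_attention_mask(torch.ones(1, 11, dtype=torch.int64)).shape)  # [1, 1, 11, 11]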
+        linear = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view = linear.view(1, -1, 8, 64)
+        linear = None
+        query_layer = view.transpose(1, 2)
+        view = None
+        linear_1 = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_1 = linear_1.view(1, -1, 8, 64)
+        linear_1 = None
+        key_layer = view_1.transpose(1, 2)
+        view_1 = None
+        linear_2 = torch._C._nn.linear(
+            embeddings_3,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_2 = linear_2.view(1, -1, 8, 64)
+        linear_2 = None
+        value_layer = view_2.transpose(1, 2)
+        view_2 = None
+        attn_output = torch._C._nn.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer = key_layer = value_layer = None
+        attn_output_1 = attn_output.transpose(1, 2)
+        attn_output = None
+        attn_output_2 = attn_output_1.reshape(1, 11, 512)
+        attn_output_1 = None
+        hidden_states = torch._C._nn.linear(
+            attn_output_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False)
+        hidden_states = None
+        add_1 = hidden_states_1 + embeddings_3
+        hidden_states_1 = embeddings_3 = None
+        hidden_states_2 = torch.nn.functional.layer_norm(
+            add_1,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
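# Editor's note: each unrolled attention block above is one standard multi-head
# self-attention with 8 heads of size 64 (hidden size 512): three projections,
# a reshape to [batch, heads, seq, head_dim], and one call to
# scaled_dot_product_attention. A compact sketch of the same computation, with
# hypothetical parameter names:
import torch
import torch.nn.functional as F

def self_attention(x, wq, bq, wk, bk, wv, bv, attn_mask, n_heads=8, head_dim=64):
    b, s, _ = x.shape
    q = F.linear(x, wq, bq).view(b, s, n_heads, head_dim).transpose(1, 2)
    k = F.linear(x, wk, bk).view(b, s, n_heads, head_dim).transpose(1, 2)
    v = F.linear(x, wv, bv).view(b, s, n_heads, head_dim).transpose(1, 2)
    out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)
    # Merge the heads back into the hidden dimension.
    return out.transpose(1, 2).reshape(b, s, n_heads * head_dim)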
+        hidden_states_3 = torch._C._nn.linear(
+            hidden_states_2,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+        hidden_states_3 = None
+        hidden_states_5 = torch._C._nn.linear(
+            hidden_states_4,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_6 = torch.nn.functional.dropout(
+            hidden_states_5, 0.1, False, False
+        )
+        hidden_states_5 = None
+        add_2 = hidden_states_6 + hidden_states_2
+        hidden_states_6 = hidden_states_2 = None
+        hidden_states_7 = torch.nn.functional.layer_norm(
+            add_2,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_6 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_3 = linear_6.view(1, -1, 8, 64)
+        linear_6 = None
+        query_layer_1 = view_3.transpose(1, 2)
+        view_3 = None
+        linear_7 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_4 = linear_7.view(1, -1, 8, 64)
+        linear_7 = None
+        key_layer_1 = view_4.transpose(1, 2)
+        view_4 = None
+        linear_8 = torch._C._nn.linear(
+            hidden_states_7,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_5 = linear_8.view(1, -1, 8, 64)
+        linear_8 = None
+        value_layer_1 = view_5.transpose(1, 2)
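# Editor's note: the feed-forward half of each layer above is the standard BERT
# MLP: 512 -> 2048 -> GELU -> 512, followed by dropout, a residual add, and
# LayerNorm with eps=1e-12. An equivalent sketch (parameter names are
# illustrative, not from this patch):
import torch.nn.functional as F

def bert_ffn(x, w_in, b_in, w_out, b_out, ln_w, ln_b, p=0.1, training=False):
    h = F.gelu(F.linear(x, w_in, b_in))                     # intermediate dense + GELU
    h = F.dropout(F.linear(h, w_out, b_out), p=p, training=training)  # output dense + dropout
    return F.layer_norm(h + x, (x.shape[-1],), ln_w, ln_b, 1e-12)     # residual + LayerNorm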
+        view_5 = None
+        attn_output_3 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_1,
+            key_layer_1,
+            value_layer_1,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_1 = key_layer_1 = value_layer_1 = None
+        attn_output_4 = attn_output_3.transpose(1, 2)
+        attn_output_3 = None
+        attn_output_5 = attn_output_4.reshape(1, 11, 512)
+        attn_output_4 = None
+        hidden_states_8 = torch._C._nn.linear(
+            attn_output_5,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_9 = torch.nn.functional.dropout(
+            hidden_states_8, 0.1, False, False
+        )
+        hidden_states_8 = None
+        add_3 = hidden_states_9 + hidden_states_7
+        hidden_states_9 = hidden_states_7 = None
+        hidden_states_10 = torch.nn.functional.layer_norm(
+            add_3,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_11 = torch._C._nn.linear(
+            hidden_states_10,
+            l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+        hidden_states_11 = None
+        hidden_states_13 = torch._C._nn.linear(
+            hidden_states_12,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_14 = torch.nn.functional.dropout(
+            hidden_states_13, 0.1, False, False
+        )
+        hidden_states_13 = None
+        add_4 = hidden_states_14 + hidden_states_10
+        hidden_states_14 = hidden_states_10 = None
+        hidden_states_15 = torch.nn.functional.layer_norm(
+            add_4,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_12 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_6 = linear_12.view(1, -1, 8, 64)
+        linear_12 = None
+        query_layer_2 = view_6.transpose(1, 2)
+        view_6 = None
+        linear_13 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_7 = linear_13.view(1, -1, 8, 64)
+        linear_13 = None
+        key_layer_2 = view_7.transpose(1, 2)
+        view_7 = None
+        linear_14 = torch._C._nn.linear(
+            hidden_states_15,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_8 = linear_14.view(1, -1, 8, 64)
+        linear_14 = None
+        value_layer_2 = view_8.transpose(1, 2)
+        view_8 = None
+        attn_output_6 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_2,
+            key_layer_2,
+            value_layer_2,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_2 = key_layer_2 = value_layer_2 = None
+        attn_output_7 = attn_output_6.transpose(1, 2)
+        attn_output_6 = None
+        attn_output_8 = attn_output_7.reshape(1, 11, 512)
+        attn_output_7 = None
+        hidden_states_16 = torch._C._nn.linear(
+            attn_output_8,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_17 = torch.nn.functional.dropout(
+            hidden_states_16, 0.1, False, False
+        )
+        hidden_states_16 = None
+        add_5 = hidden_states_17 + hidden_states_15
+        hidden_states_17 = hidden_states_15 = None
+        hidden_states_18 = torch.nn.functional.layer_norm(
+            add_5,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_19 = torch._C._nn.linear(
+            hidden_states_18,
+            l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+        hidden_states_19 = None
+        hidden_states_21 = torch._C._nn.linear(
+            hidden_states_20,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_22 = torch.nn.functional.dropout(
+            hidden_states_21, 0.1, False, False
+        )
+        hidden_states_21 = None
+        add_6 = hidden_states_22 + hidden_states_18
+        hidden_states_22 = hidden_states_18 = None
+        hidden_states_23 = torch.nn.functional.layer_norm(
+            add_6,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_18 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_9 = linear_18.view(1, -1, 8, 64)
+        linear_18 = None
+        query_layer_3 = view_9.transpose(1, 2)
+        view_9 = None
+        linear_19 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_10 = linear_19.view(1, -1, 8, 64)
+        linear_19 = None
+        key_layer_3 = view_10.transpose(1, 2)
+        view_10 = None
+        linear_20 = torch._C._nn.linear(
+            hidden_states_23,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_11 = linear_20.view(1, -1, 8, 64)
+        linear_20 = None
+        value_layer_3 = view_11.transpose(1, 2)
+        view_11 = None
+        attn_output_9 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_3,
+            key_layer_3,
+            value_layer_3,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_3 = key_layer_3 = value_layer_3 = None
+        attn_output_10 = attn_output_9.transpose(1, 2)
+        attn_output_9 = None
+        attn_output_11 = attn_output_10.reshape(1, 11, 512)
+        attn_output_10 = None
+        hidden_states_24 = torch._C._nn.linear(
+            attn_output_11,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_25 = torch.nn.functional.dropout(
+            hidden_states_24, 0.1, False, False
+        )
+        hidden_states_24 = None
+        add_7 = hidden_states_25 + hidden_states_23
+        hidden_states_25 = hidden_states_23 = None
+        hidden_states_26 = torch.nn.functional.layer_norm(
+            add_7,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_7 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_27 = torch._C._nn.linear(
+            hidden_states_26,
+            l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+        hidden_states_27 = None
+        hidden_states_29 = torch._C._nn.linear(
+            hidden_states_28,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_30 = torch.nn.functional.dropout(
+            hidden_states_29, 0.1, False, False
+        )
+        hidden_states_29 = None
+        add_8 = hidden_states_30 + hidden_states_26
+        hidden_states_30 = hidden_states_26 = None
+        hidden_states_31 = torch.nn.functional.layer_norm(
+            add_8,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_8 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_24 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_12 = linear_24.view(1, -1, 8, 64)
+        linear_24 = None
+        query_layer_4 = view_12.transpose(1, 2)
+        view_12 = None
+        linear_25 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_13 = linear_25.view(1, -1, 8, 64)
+        linear_25 = None
+        key_layer_4 = view_13.transpose(1, 2)
+        view_13 = None
+        linear_26 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_14 = linear_26.view(1, -1, 8, 64)
+        linear_26 = None
+        value_layer_4 = view_14.transpose(1, 2)
+        view_14 = None
+        attn_output_12 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_4,
+            key_layer_4,
+            value_layer_4,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_4 = key_layer_4 = value_layer_4 = None
+        attn_output_13 = attn_output_12.transpose(1, 2)
+        attn_output_12 = None
+        attn_output_14 = attn_output_13.reshape(1, 11, 512)
+        attn_output_13 = None
+        hidden_states_32 = torch._C._nn.linear(
+            attn_output_14,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_33 = torch.nn.functional.dropout(
+            hidden_states_32, 0.1, False, False
+        )
+        hidden_states_32 = None
+        add_9 = hidden_states_33 + hidden_states_31
+        hidden_states_33 = hidden_states_31 = None
+        hidden_states_34 = torch.nn.functional.layer_norm(
+            add_9,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_9 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_35 = torch._C._nn.linear(
+            hidden_states_34,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+        hidden_states_35 = None
+        hidden_states_37 = torch._C._nn.linear(
+            hidden_states_36,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_38 = torch.nn.functional.dropout(
+            hidden_states_37, 0.1, False, False
+        )
+        hidden_states_37 = None
+        add_10 = hidden_states_38 + hidden_states_34
+        hidden_states_38 = hidden_states_34 = None
+        hidden_states_39 = torch.nn.functional.layer_norm(
+            add_10,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_10 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_30 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_15 = linear_30.view(1, -1, 8, 64)
+        linear_30 = None
+        query_layer_5 = view_15.transpose(1, 2)
+        view_15 = None
+        linear_31 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_16 = linear_31.view(1, -1, 8, 64)
+        linear_31 = None
+        key_layer_5 = view_16.transpose(1, 2)
+        view_16 = None
+        linear_32 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_17 = linear_32.view(1, -1, 8, 64)
+        linear_32 = None
+        value_layer_5 = view_17.transpose(1, 2)
+        view_17 = None
+        attn_output_15 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_5,
+            key_layer_5,
+            value_layer_5,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_5 = key_layer_5 = value_layer_5 = None
+        attn_output_16 = attn_output_15.transpose(1, 2)
+        attn_output_15 = None
+        attn_output_17 = attn_output_16.reshape(1, 11, 512)
+        attn_output_16 = None
+        hidden_states_40 = torch._C._nn.linear(
+            attn_output_17,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_41 = torch.nn.functional.dropout(
+            hidden_states_40, 0.1, False, False
+        )
+        hidden_states_40 = None
+        add_11 = hidden_states_41 + hidden_states_39
+        hidden_states_41 = hidden_states_39 = None
+        hidden_states_42 = torch.nn.functional.layer_norm(
+            add_11,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_11 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_43 = torch._C._nn.linear(
+            hidden_states_42,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+        hidden_states_43 = None
+        hidden_states_45 = torch._C._nn.linear(
+            hidden_states_44,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_46 = torch.nn.functional.dropout(
+            hidden_states_45, 0.1, False, False
+        )
+        hidden_states_45 = None
+        add_12 = hidden_states_46 + hidden_states_42
+        hidden_states_46 = hidden_states_42 = None
+        hidden_states_47 = torch.nn.functional.layer_norm(
+            add_12,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_12 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_36 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_18 = linear_36.view(1, -1, 8, 64)
+        linear_36 = None
+        query_layer_6 = view_18.transpose(1, 2)
+        view_18 = None
+        linear_37 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_19 = linear_37.view(1, -1, 8, 64)
+        linear_37 = None
+        key_layer_6 = view_19.transpose(1, 2)
+        view_19 = None
+        linear_38 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_20 = linear_38.view(1, -1, 8, 64)
+        linear_38 = None
+        value_layer_6 = view_20.transpose(1, 2)
+        view_20 = None
+        attn_output_18 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_6,
+            key_layer_6,
+            value_layer_6,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_6 = key_layer_6 = value_layer_6 = None
+        attn_output_19 = attn_output_18.transpose(1, 2)
+        attn_output_18 = None
+        attn_output_20 = attn_output_19.reshape(1, 11, 512)
+        attn_output_19 = None
+        hidden_states_48 = torch._C._nn.linear(
+            attn_output_20,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_49 = torch.nn.functional.dropout(
+            hidden_states_48, 0.1, False, False
+        )
+        hidden_states_48 = None
+        add_13 = hidden_states_49 + hidden_states_47
+        hidden_states_49 = hidden_states_47 = None
+        hidden_states_50 = torch.nn.functional.layer_norm(
+            add_13,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_13 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_51 = torch._C._nn.linear(
+            hidden_states_50,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+        hidden_states_51 = None
+        hidden_states_53 = torch._C._nn.linear(
+            hidden_states_52,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_54 = torch.nn.functional.dropout(
+            hidden_states_53, 0.1, False, False
+        )
+        hidden_states_53 = None
+        add_14 = hidden_states_54 + hidden_states_50
+        hidden_states_54 = hidden_states_50 = None
+        hidden_states_55 = torch.nn.functional.layer_norm(
+            add_14,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_14 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        linear_42 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+        view_21 = linear_42.view(1, -1, 8, 64)
+        linear_42 = None
+        query_layer_7 = view_21.transpose(1, 2)
+        view_21 = None
+        linear_43 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+        view_22 = linear_43.view(1, -1, 8, 64)
+        linear_43 = None
+        key_layer_7 = view_22.transpose(1, 2)
+        view_22 = None
+        linear_44 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+        view_23 = linear_44.view(1, -1, 8, 64)
+        linear_44 = None
+        value_layer_7 = view_23.transpose(1, 2)
+        view_23 = None
+        attn_output_21 = torch._C._nn.scaled_dot_product_attention(
+            query_layer_7,
+            key_layer_7,
+            value_layer_7,
+            attn_mask=extended_attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+        )
+        query_layer_7 = key_layer_7 = value_layer_7 = extended_attention_mask = None
+        attn_output_22 = attn_output_21.transpose(1, 2)
+        attn_output_21 = None
+        attn_output_23 = attn_output_22.reshape(1, 11, 512)
+        attn_output_22 = None
+        hidden_states_56 = torch._C._nn.linear(
+            attn_output_23,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        attn_output_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_57 = torch.nn.functional.dropout(
+            hidden_states_56, 0.1, False, False
+        )
+        hidden_states_56 = None
+        add_15 = hidden_states_57 + hidden_states_55
+        hidden_states_57 = hidden_states_55 = None
+        hidden_states_58 = torch.nn.functional.layer_norm(
+            add_15,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_15 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        hidden_states_59 = torch._C._nn.linear(
+            hidden_states_58,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = (None)
+        hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+        hidden_states_59 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = (None)
+        hidden_states_62 = torch.nn.functional.dropout(
+            hidden_states_61, 0.1, False, False
+        )
+        hidden_states_61 = None
+        add_16 = hidden_states_62 + hidden_states_58
+        hidden_states_62 = hidden_states_58 = None
+        hidden_states_63 = torch.nn.functional.layer_norm(
+            add_16,
+            (512,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_16 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        first_token_tensor = hidden_states_63[(slice(None, None, None), 0)]
+        pooled_output = torch._C._nn.linear(
+            first_token_tensor,
+            l_self_modules_pooler_modules_dense_parameters_weight_,
+            l_self_modules_pooler_modules_dense_parameters_bias_,
+        )
+        first_token_tensor = (
+            l_self_modules_pooler_modules_dense_parameters_weight_
+        ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None
+        pooled_output_1 = torch.tanh(pooled_output)
+        pooled_output = None
+        return (hidden_states_63, pooled_output_1)
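# Editor's note: the graph ends with the BERT pooler: the first-token ([CLS])
# hidden state goes through a dense layer and tanh, and the graph returns both
# the final hidden states and the pooled vector. A one-line equivalent sketch:
import torch

def pool(hidden_states, w, b):
    return torch.tanh(torch.nn.functional.linear(hidden_states[:, 0], w, b))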
diff --git a/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/weight_meta.py b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/weight_meta.py
new file mode 100644
index 000000000..d03c4762c
--- /dev/null
+++ b/samples/transformers-auto-model/muhtasham_medium-vanilla-target-tweet/weight_meta.py
@@ -0,0 +1,1389 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [101, 10629, 7159, 2003, 2109, 2000, 14817, 15078, 19287, 1012, 102]
+
+
+class Program_weight_tensor_meta_L_token_type_ids_:
+    name = "L_token_type_ids_"
+    shape = [1, 11]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_:
+    name = "L_self_modules_embeddings_buffers_position_ids_"
+    shape = [1, 512]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 0
+    max_val = 511
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [30522, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [2, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
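# Editor's note: each Program_weight_tensor_meta_* class records just enough
# metadata to rebuild a stand-in tensor: shape, dtype, device, and either
# literal data (small integer tensors), mean/std (random weights), or
# min_val/max_val (e.g. the position_ids buffer). A hypothetical consumer
# (rematerialize is not part of this patch) could look like:
import torch

def rematerialize(meta):
    dtype = getattr(torch, meta.dtype.split(".")[-1])
    if getattr(meta, "data", None) is not None:
        return torch.tensor(meta.data, dtype=dtype).reshape(meta.shape)
    t = torch.empty(meta.shape, dtype=dtype)
    if getattr(meta, "mean", None) is not None and getattr(meta, "std", None) is not None:
        return t.normal_(meta.mean, meta.std)
    if getattr(meta, "min_val", None) is not None:
        return torch.randint(meta.min_val, meta.max_val + 1, meta.shape, dtype=dtype)
    return t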
"L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [2048, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [512, 512]
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std 
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [2048, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_"
+    shape = [512, 2048]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_:
"L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_:
"L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [512, 512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [512]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_:
"L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_hash.txt b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_hash.txt new file mode 100644 index 000000000..c89f61d4f --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_hash.txt @@ -0,0 +1 @@ +f7ce2c05d117fed2bf75b7f405d38c3c427ceaccce407eda2f67aa05342c58ea \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_net.json b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_meta.py 
diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_meta.py b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_tensor_constraints.py b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/model.py b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/model.py
new file mode 100644
index 000000000..f675de83a
--- /dev/null
+++ b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/model.py
@@ -0,0 +1,620 @@
+import torch
+
+
+class GraphModule(torch.nn.Module):
+    def forward(
+        self,
+        L_input_ids_: torch.Tensor,
+        L_token_type_ids_: torch.Tensor,
+        L_self_modules_embeddings_buffers_position_ids_: torch.Tensor,
+        L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_attention_mask_: torch.Tensor,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        l_input_ids_ = L_input_ids_
+        l_token_type_ids_ = L_token_type_ids_
+        l_self_modules_embeddings_buffers_position_ids_ = (
+            L_self_modules_embeddings_buffers_position_ids_
+        )
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
+        l_attention_mask_ = L_attention_mask_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_pooler_modules_dense_parameters_weight_ = ( + L_self_modules_pooler_modules_dense_parameters_weight_ + ) + l_self_modules_pooler_modules_dense_parameters_bias_ = ( + L_self_modules_pooler_modules_dense_parameters_bias_ + ) + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(0, 11, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, + 0, + None, + 2.0, + False, + False, + ) + l_input_ids_ = ( + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) = None + token_type_embeddings = torch.nn.functional.embedding( + l_token_type_ids_, + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + l_token_type_ids_ = ( + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) = None + embeddings = inputs_embeds + token_type_embeddings + inputs_embeds = token_type_embeddings = None + position_embeddings = torch.nn.functional.embedding( + position_ids, + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = ( + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) = None + embeddings += position_embeddings + embeddings_1 = embeddings + embeddings = position_embeddings = None + embeddings_2 = torch.nn.functional.layer_norm( + embeddings_1, + (256,), + l_self_modules_embeddings_modules_layer_norm_parameters_weight_, + l_self_modules_embeddings_modules_layer_norm_parameters_bias_, + 1e-12, + ) + embeddings_1 = ( + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None + embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False) + embeddings_2 = None + getitem_1 = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + expand = getitem_1.expand(1, 1, 11, 11) + getitem_1 = None + expanded_mask = expand.to(torch.float32) + expand = None + tensor = torch.tensor(1.0, dtype=torch.float32) + inverted_mask = tensor - expanded_mask + tensor = expanded_mask = None + to_1 = inverted_mask.to(torch.bool) + extended_attention_mask = inverted_mask.masked_fill( + to_1, -3.4028234663852886e38 + ) + inverted_mask = to_1 = None + linear = torch._C._nn.linear( + embeddings_3, + 
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view = linear.view(1, -1, 4, 64) + linear = None + query_layer = view.transpose(1, 2) + view = None + linear_1 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_1.view(1, -1, 4, 64) + linear_1 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_2 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_2.view(1, -1, 4, 64) + linear_2 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer = key_layer = value_layer = None + attn_output_1 = attn_output.transpose(1, 2) + attn_output = None + attn_output_2 = attn_output_1.reshape(1, 11, 256) + attn_output_1 = None + hidden_states = torch._C._nn.linear( + attn_output_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + add_1 = hidden_states_1 + embeddings_3 + hidden_states_1 = embeddings_3 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_1, + (256,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + 
l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + hidden_states_4, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.1, False, False + ) + hidden_states_5 = None + add_2 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_2, + (256,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_3 = linear_6.view(1, -1, 4, 64) + linear_6 = None + query_layer_1 = view_3.transpose(1, 2) + view_3 = None + linear_7 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_4 = linear_7.view(1, -1, 4, 64) + linear_7 = None + key_layer_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_5 = linear_8.view(1, -1, 4, 64) + linear_8 = None + value_layer_1 = view_5.transpose(1, 2) + 
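# Descriptive note (editorial comment, not part of the captured graph): encoder
+ # layer 1 repeats the same traced attention pattern as layer 0 above — the
+ # 256-dim hidden states are projected to query/key/value, reshaped to
+ # (batch, num_heads=4, seq_len, head_dim=64), and run through the fused
+ # scaled_dot_product_attention kernel with the additive attention mask
+ # (dropout_p=0.0, i.e. inference mode). +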
view_5 = None + attn_output_3 = torch._C._nn.scaled_dot_product_attention( + query_layer_1, + key_layer_1, + value_layer_1, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_1 = key_layer_1 = value_layer_1 = None + attn_output_4 = attn_output_3.transpose(1, 2) + attn_output_3 = None + attn_output_5 = attn_output_4.reshape(1, 11, 256) + attn_output_4 = None + hidden_states_8 = torch._C._nn.linear( + attn_output_5, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_3 = hidden_states_9 + hidden_states_7 + hidden_states_9 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + add_3, + (256,), + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.gelu(hidden_states_11) + hidden_states_11 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_4 = hidden_states_14 + hidden_states_10 + hidden_states_14 = hidden_states_10 = None + hidden_states_15 = torch.nn.functional.layer_norm( + add_4, + (256,), + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_12 = torch._C._nn.linear( + hidden_states_15, + 
l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_6 = linear_12.view(1, -1, 4, 64) + linear_12 = None + query_layer_2 = view_6.transpose(1, 2) + view_6 = None + linear_13 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_7 = linear_13.view(1, -1, 4, 64) + linear_13 = None + key_layer_2 = view_7.transpose(1, 2) + view_7 = None + linear_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_8 = linear_14.view(1, -1, 4, 64) + linear_14 = None + value_layer_2 = view_8.transpose(1, 2) + view_8 = None + attn_output_6 = torch._C._nn.scaled_dot_product_attention( + query_layer_2, + key_layer_2, + value_layer_2, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_2 = key_layer_2 = value_layer_2 = None + attn_output_7 = attn_output_6.transpose(1, 2) + attn_output_6 = None + attn_output_8 = attn_output_7.reshape(1, 11, 256) + attn_output_7 = None + hidden_states_16 = torch._C._nn.linear( + attn_output_8, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_17 = torch.nn.functional.dropout( + hidden_states_16, 0.1, False, False + ) + hidden_states_16 = None + add_5 = hidden_states_17 + hidden_states_15 + hidden_states_17 = hidden_states_15 = None + hidden_states_18 = torch.nn.functional.layer_norm( + add_5, + (256,), + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
(None) + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_20 = torch._C._nn.gelu(hidden_states_19) + hidden_states_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + add_6 = hidden_states_22 + hidden_states_18 + hidden_states_22 = hidden_states_18 = None + hidden_states_23 = torch.nn.functional.layer_norm( + add_6, + (256,), + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_18 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_9 = linear_18.view(1, -1, 4, 64) + linear_18 = None + query_layer_3 = view_9.transpose(1, 2) + view_9 = None + linear_19 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_10 = linear_19.view(1, -1, 4, 64) + linear_19 = None + key_layer_3 = view_10.transpose(1, 2) + view_10 = None + linear_20 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + 
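# Descriptive note (editorial comment, not part of the captured graph): this is
+ # the final encoder layer (index 3 of 4). After its attention block the
+ # extended_attention_mask is released, and the last hidden state at position 0
+ # (the [CLS] token) is fed through the tanh pooler to produce the second
+ # return value. +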
view_11 = linear_20.view(1, -1, 4, 64) + linear_20 = None + value_layer_3 = view_11.transpose(1, 2) + view_11 = None + attn_output_9 = torch._C._nn.scaled_dot_product_attention( + query_layer_3, + key_layer_3, + value_layer_3, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_3 = key_layer_3 = value_layer_3 = extended_attention_mask = None + attn_output_10 = attn_output_9.transpose(1, 2) + attn_output_9 = None + attn_output_11 = attn_output_10.reshape(1, 11, 256) + attn_output_10 = None + hidden_states_24 = torch._C._nn.linear( + attn_output_11, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_25 = torch.nn.functional.dropout( + hidden_states_24, 0.1, False, False + ) + hidden_states_24 = None + add_7 = hidden_states_25 + hidden_states_23 + hidden_states_25 = hidden_states_23 = None + hidden_states_26 = torch.nn.functional.layer_norm( + add_7, + (256,), + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_7 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_28 = torch._C._nn.gelu(hidden_states_27) + hidden_states_27 = None + hidden_states_29 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + add_8 = hidden_states_30 + hidden_states_26 + hidden_states_30 = hidden_states_26 = None + hidden_states_31 = torch.nn.functional.layer_norm( + add_8, + (256,), + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_8 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None) + first_token_tensor = hidden_states_31[(slice(None, None, None), 0)] + pooled_output = torch._C._nn.linear( + first_token_tensor, + l_self_modules_pooler_modules_dense_parameters_weight_, + l_self_modules_pooler_modules_dense_parameters_bias_, + ) + first_token_tensor = ( + l_self_modules_pooler_modules_dense_parameters_weight_ + ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None + pooled_output_1 = torch.tanh(pooled_output) + pooled_output = None + return (hidden_states_31, pooled_output_1) diff --git a/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/weight_meta.py b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/weight_meta.py new file mode 100644 index 000000000..ad7d8a86b --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_mini-vanilla-target-tweet/weight_meta.py @@ -0,0 +1,749 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [101, 10629, 7159, 2003, 2109, 2000, 14817, 15078, 19287, 1012, 102] + + +class Program_weight_tensor_meta_L_token_type_ids_: + name = "L_token_type_ids_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_: + name = "L_self_modules_embeddings_buffers_position_ids_" + shape = [1, 512] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 511 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [30522, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [2, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [512, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + 
dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [1024, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [256, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [1024, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [256, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std 
= 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [1024, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [256, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: 
+ name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [1024, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [256, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [256, 256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [256] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_hash.txt b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_hash.txt new file mode 100644 index 000000000..8cfcd54e2 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_hash.txt @@ -0,0 +1 @@ +f7d58b8c26ccce3c6d3460801e4e9529159d4865c15f8b0b535f693b539a7649 \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_net.json b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- 
/dev/null +++ b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_meta.py b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_tensor_constraints.py b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/model.py b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/model.py new file mode 100644 index 000000000..f0893e276 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/model.py @@ -0,0 +1,620 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_token_type_ids_ = L_token_type_ids_ + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_attention_mask_ = L_attention_mask_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = 
L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_pooler_modules_dense_parameters_weight_ = ( + L_self_modules_pooler_modules_dense_parameters_weight_ + ) + l_self_modules_pooler_modules_dense_parameters_bias_ = ( + L_self_modules_pooler_modules_dense_parameters_bias_ + ) + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(0, 11, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, + 0, + None, + 2.0, + False, + False, + ) + l_input_ids_ = ( + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) = None + token_type_embeddings = torch.nn.functional.embedding( + l_token_type_ids_, + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + l_token_type_ids_ = ( + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) = None + embeddings = inputs_embeds + token_type_embeddings + inputs_embeds = token_type_embeddings = None + position_embeddings = torch.nn.functional.embedding( + position_ids, + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = ( + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) = None + embeddings += position_embeddings + embeddings_1 = embeddings + embeddings = position_embeddings = None + embeddings_2 = torch.nn.functional.layer_norm( + embeddings_1, + (512,), + l_self_modules_embeddings_modules_layer_norm_parameters_weight_, + l_self_modules_embeddings_modules_layer_norm_parameters_bias_, + 1e-12, + ) + embeddings_1 = ( + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None + embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False) + embeddings_2 = None + getitem_1 = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + expand = getitem_1.expand(1, 1, 11, 11) + getitem_1 = None + expanded_mask = expand.to(torch.float32) + expand = None + 
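+        # The 2D [batch, seq] attention mask has just been broadcast to
+        # [batch, 1, seq, seq]; below it is inverted (1 - mask) and the padded
+        # positions are filled with the float32 minimum, producing the additive
+        # bias consumed by scaled_dot_product_attention in every encoder layer.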
tensor = torch.tensor(1.0, dtype=torch.float32) + inverted_mask = tensor - expanded_mask + tensor = expanded_mask = None + to_1 = inverted_mask.to(torch.bool) + extended_attention_mask = inverted_mask.masked_fill( + to_1, -3.4028234663852886e38 + ) + inverted_mask = to_1 = None + linear = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view = linear.view(1, -1, 8, 64) + linear = None + query_layer = view.transpose(1, 2) + view = None + linear_1 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_1.view(1, -1, 8, 64) + linear_1 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_2 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_2.view(1, -1, 8, 64) + linear_2 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer = key_layer = value_layer = None + attn_output_1 = attn_output.transpose(1, 2) + attn_output = None + attn_output_2 = attn_output_1.reshape(1, 11, 512) + attn_output_1 = None + hidden_states = torch._C._nn.linear( + attn_output_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + add_1 = hidden_states_1 + embeddings_3 + hidden_states_1 = embeddings_3 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_1, + (512,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + 
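+        # Consumed locals are released by assigning None, as in Dynamo-captured
+        # graphs, so intermediate buffers can be freed eagerly; the feed-forward
+        # sub-block of layer 0 (dense -> GELU -> dense -> dropout, then a
+        # residual add and LayerNorm) follows.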
add_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + hidden_states_4, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.1, False, False + ) + hidden_states_5 = None + add_2 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_2, + (512,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_3 = linear_6.view(1, -1, 8, 64) + linear_6 = None + query_layer_1 = view_3.transpose(1, 2) + view_3 = None + linear_7 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_4 = linear_7.view(1, -1, 8, 64) + linear_7 = None + key_layer_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + 
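+        # linear_8 is layer 1's value projection; like the query and key above,
+        # it is viewed as (1, -1, 8, 64) and transposed to
+        # [batch, heads, seq, head_dim] (8 heads x 64 = 512 hidden) before
+        # scaled_dot_product_attention reuses the shared additive mask.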
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_5 = linear_8.view(1, -1, 8, 64) + linear_8 = None + value_layer_1 = view_5.transpose(1, 2) + view_5 = None + attn_output_3 = torch._C._nn.scaled_dot_product_attention( + query_layer_1, + key_layer_1, + value_layer_1, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_1 = key_layer_1 = value_layer_1 = None + attn_output_4 = attn_output_3.transpose(1, 2) + attn_output_3 = None + attn_output_5 = attn_output_4.reshape(1, 11, 512) + attn_output_4 = None + hidden_states_8 = torch._C._nn.linear( + attn_output_5, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_3 = hidden_states_9 + hidden_states_7 + hidden_states_9 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + add_3, + (512,), + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.gelu(hidden_states_11) + hidden_states_11 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_4 = hidden_states_14 + hidden_states_10 + hidden_states_14 = hidden_states_10 = None + hidden_states_15 = torch.nn.functional.layer_norm( + add_4, + (512,), + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_12 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_6 = linear_12.view(1, -1, 8, 64) + linear_12 = None + query_layer_2 = view_6.transpose(1, 2) + view_6 = None + linear_13 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_7 = linear_13.view(1, -1, 8, 64) + linear_13 = None + key_layer_2 = view_7.transpose(1, 2) + view_7 = None + linear_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_8 = linear_14.view(1, -1, 8, 64) + linear_14 = None + value_layer_2 = view_8.transpose(1, 2) + view_8 = None + attn_output_6 = torch._C._nn.scaled_dot_product_attention( + query_layer_2, + key_layer_2, + value_layer_2, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_2 = key_layer_2 = value_layer_2 = None + attn_output_7 = attn_output_6.transpose(1, 2) + attn_output_6 = None + attn_output_8 = attn_output_7.reshape(1, 11, 512) + attn_output_7 = None + hidden_states_16 = torch._C._nn.linear( + attn_output_8, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_17 = torch.nn.functional.dropout( + hidden_states_16, 0.1, False, False + ) + hidden_states_16 = None + add_5 = hidden_states_17 + hidden_states_15 + hidden_states_17 = hidden_states_15 = None + hidden_states_18 = torch.nn.functional.layer_norm( + add_5, + (512,), + 
l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_20 = torch._C._nn.gelu(hidden_states_19) + hidden_states_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + add_6 = hidden_states_22 + hidden_states_18 + hidden_states_22 = hidden_states_18 = None + hidden_states_23 = torch.nn.functional.layer_norm( + add_6, + (512,), + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_18 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_9 = linear_18.view(1, -1, 8, 64) + linear_18 = None + query_layer_3 = view_9.transpose(1, 2) + view_9 = None + linear_19 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_10 = linear_19.view(1, -1, 8, 64) + linear_19 = None + key_layer_3 = view_10.transpose(1, 2) + view_10 = None + linear_20 = 
torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_11 = linear_20.view(1, -1, 8, 64) + linear_20 = None + value_layer_3 = view_11.transpose(1, 2) + view_11 = None + attn_output_9 = torch._C._nn.scaled_dot_product_attention( + query_layer_3, + key_layer_3, + value_layer_3, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_3 = key_layer_3 = value_layer_3 = extended_attention_mask = None + attn_output_10 = attn_output_9.transpose(1, 2) + attn_output_9 = None + attn_output_11 = attn_output_10.reshape(1, 11, 512) + attn_output_10 = None + hidden_states_24 = torch._C._nn.linear( + attn_output_11, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_25 = torch.nn.functional.dropout( + hidden_states_24, 0.1, False, False + ) + hidden_states_24 = None + add_7 = hidden_states_25 + hidden_states_23 + hidden_states_25 = hidden_states_23 = None + hidden_states_26 = torch.nn.functional.layer_norm( + add_7, + (512,), + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_7 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_28 = torch._C._nn.gelu(hidden_states_27) + hidden_states_27 = None + hidden_states_29 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + add_8 = 
hidden_states_30 + hidden_states_26 + hidden_states_30 = hidden_states_26 = None + hidden_states_31 = torch.nn.functional.layer_norm( + add_8, + (512,), + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_8 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None) + first_token_tensor = hidden_states_31[(slice(None, None, None), 0)] + pooled_output = torch._C._nn.linear( + first_token_tensor, + l_self_modules_pooler_modules_dense_parameters_weight_, + l_self_modules_pooler_modules_dense_parameters_bias_, + ) + first_token_tensor = ( + l_self_modules_pooler_modules_dense_parameters_weight_ + ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None + pooled_output_1 = torch.tanh(pooled_output) + pooled_output = None + return (hidden_states_31, pooled_output_1) diff --git a/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/weight_meta.py b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/weight_meta.py new file mode 100644 index 000000000..4358be721 --- /dev/null +++ b/samples/transformers-auto-model/muhtasham_small-vanilla-target-tweet/weight_meta.py @@ -0,0 +1,749 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [101, 10629, 7159, 2003, 2109, 2000, 14817, 15078, 19287, 1012, 102] + + +class Program_weight_tensor_meta_L_token_type_ids_: + name = "L_token_type_ids_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_: + name = "L_self_modules_embeddings_buffers_position_ids_" + shape = [1, 512] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 511 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [30522, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [2, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = 
[512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 11] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + 
std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [2048, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [512, 2048] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_: + name = "L_self_modules_pooler_modules_dense_parameters_weight_" + shape = [512, 512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_: + name = "L_self_modules_pooler_modules_dense_parameters_bias_" + shape = [512] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_hash.txt 
b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_hash.txt new file mode 100644 index 000000000..2bf68aa62 --- /dev/null +++ b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_hash.txt @@ -0,0 +1 @@ +4c98032cfd4c816917e7f0486aefce24f49459c427b235555091b1effec466e9 \ No newline at end of file diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_net.json b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_meta.py b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_tensor_constraints.py b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/model.py b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/model.py new file mode 100644 index 000000000..d0aa0980e --- /dev/null +++ b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/model.py @@ -0,0 +1,1851 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_self_modules_embeddings_buffers_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_attention_mask_ = L_attention_mask_ + l_self_modules_embeddings_buffers_token_type_ids_ = ( + L_self_modules_embeddings_buffers_token_type_ids_ + ) + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + 
l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ + 
l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_ + buffered_token_type_ids = l_self_modules_embeddings_buffers_token_type_ids_[ + (slice(None, None, None), slice(None, 16, None)) + ] + l_self_modules_embeddings_buffers_token_type_ids_ = None + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(1, 16) + buffered_token_type_ids = None + extended_attention_mask = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + extended_attention_mask_1 = 
extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = None + sub = 1.0 - extended_attention_mask_1 + extended_attention_mask_1 = None + extended_attention_mask_2 = sub * -3.4028234663852886e38 + sub = None + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(None, 16, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, + 1, + None, + 2.0, + False, + False, + ) + l_input_ids_ = ( + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) = None + token_type_embeddings = torch.nn.functional.embedding( + buffered_token_type_ids_expanded, + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + buffered_token_type_ids_expanded = ( + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) = None + embeddings = inputs_embeds + token_type_embeddings + inputs_embeds = token_type_embeddings = None + position_embeddings = torch.nn.functional.embedding( + position_ids, + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = ( + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) = None + embeddings += position_embeddings + embeddings_1 = embeddings + embeddings = position_embeddings = None + embeddings_2 = torch.nn.functional.layer_norm( + embeddings_1, + (768,), + l_self_modules_embeddings_modules_layer_norm_parameters_weight_, + l_self_modules_embeddings_modules_layer_norm_parameters_bias_, + 1e-05, + ) + embeddings_1 = ( + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None + embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False) + embeddings_2 = None + linear = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view = linear.view(1, -1, 12, 64) + linear = None + query_layer = view.transpose(1, 2) + view = None + linear_1 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_1.view(1, -1, 12, 64) + linear_1 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_2 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + 
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_2.view(1, -1, 12, 64) + linear_2 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + query_layer_1 = query_layer / 2.8284271247461903 + query_layer = None + key_layer_1 = key_layer / 2.8284271247461903 + key_layer = None + transpose_3 = key_layer_1.transpose(-1, -2) + key_layer_1 = None + attention_scores = torch.matmul(query_layer_1, transpose_3) + query_layer_1 = transpose_3 = None + attention_scores_1 = attention_scores + extended_attention_mask_2 + attention_scores = None + attention_probs = torch.nn.functional.softmax(attention_scores_1, dim=-1) + attention_scores_1 = None + context_layer = torch.matmul(attention_probs, value_layer) + attention_probs = None + conv2d = torch.conv2d( + value_layer, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer += conv2d + context_layer_1 = context_layer + context_layer = conv2d = None + permute = context_layer_1.permute(0, 2, 1, 3) + context_layer_1 = None + context_layer_2 = permute.contiguous() + permute = None + context_layer_3 = context_layer_2.view(1, 16, 768) + context_layer_2 = None + hidden_states = torch._C._nn.linear( + context_layer_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_3 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + add_2 = hidden_states_1 + embeddings_3 + hidden_states_1 = embeddings_3 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_2, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + hidden_states_4, + 
l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.1, False, False + ) + hidden_states_5 = None + add_3 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_3, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_4 = linear_6.view(1, -1, 12, 64) + linear_6 = None + query_layer_2 = view_4.transpose(1, 2) + view_4 = None + linear_7 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_5 = linear_7.view(1, -1, 12, 64) + linear_7 = None + key_layer_2 = view_5.transpose(1, 2) + view_5 = None + linear_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_6 = linear_8.view(1, -1, 12, 64) + linear_8 = None + value_layer_1 = view_6.transpose(1, 2) + view_6 = None + query_layer_3 = query_layer_2 / 2.8284271247461903 + query_layer_2 = None + key_layer_3 = key_layer_2 / 2.8284271247461903 + key_layer_2 = None + transpose_7 = key_layer_3.transpose(-1, -2) + key_layer_3 = None + attention_scores_2 = torch.matmul(query_layer_3, transpose_7) + query_layer_3 = transpose_7 = None + attention_scores_3 = attention_scores_2 + extended_attention_mask_2 + attention_scores_2 = None + attention_probs_1 = torch.nn.functional.softmax(attention_scores_3, dim=-1) + attention_scores_3 = None + context_layer_4 = 
torch.matmul(attention_probs_1, value_layer_1) + attention_probs_1 = None + conv2d_1 = torch.conv2d( + value_layer_1, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_1 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_4 += conv2d_1 + context_layer_5 = context_layer_4 + context_layer_4 = conv2d_1 = None + permute_1 = context_layer_5.permute(0, 2, 1, 3) + context_layer_5 = None + context_layer_6 = permute_1.contiguous() + permute_1 = None + context_layer_7 = context_layer_6.view(1, 16, 768) + context_layer_6 = None + hidden_states_8 = torch._C._nn.linear( + context_layer_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_7 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_5 = hidden_states_9 + hidden_states_7 + hidden_states_9 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + add_5, + (768,), + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.gelu(hidden_states_11) + hidden_states_11 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_6 = hidden_states_14 + hidden_states_10 + hidden_states_14 = hidden_states_10 = None + hidden_states_15 = torch.nn.functional.layer_norm( + add_6, + (768,), + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_12 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_8 = linear_12.view(1, -1, 12, 64) + linear_12 = None + query_layer_4 = view_8.transpose(1, 2) + view_8 = None + linear_13 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_9 = linear_13.view(1, -1, 12, 64) + linear_13 = None + key_layer_4 = view_9.transpose(1, 2) + view_9 = None + linear_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_10 = linear_14.view(1, -1, 12, 64) + linear_14 = None + value_layer_2 = view_10.transpose(1, 2) + view_10 = None + query_layer_5 = query_layer_4 / 2.8284271247461903 + query_layer_4 = None + key_layer_5 = key_layer_4 / 2.8284271247461903 + key_layer_4 = None + transpose_11 = key_layer_5.transpose(-1, -2) + key_layer_5 = None + attention_scores_4 = torch.matmul(query_layer_5, transpose_11) + query_layer_5 = transpose_11 = None + attention_scores_5 = attention_scores_4 + extended_attention_mask_2 + attention_scores_4 = None + attention_probs_2 = torch.nn.functional.softmax(attention_scores_5, dim=-1) + attention_scores_5 = None + context_layer_8 = torch.matmul(attention_probs_2, value_layer_2) + attention_probs_2 = None + conv2d_2 = torch.conv2d( + value_layer_2, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_2 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_8 += conv2d_2 + context_layer_9 = context_layer_8 + context_layer_8 = conv2d_2 = None + permute_2 = context_layer_9.permute(0, 2, 1, 3) + context_layer_9 = None + context_layer_10 = permute_2.contiguous() + permute_2 = None + context_layer_11 = context_layer_10.view(1, 16, 768) + context_layer_10 = None + hidden_states_16 = torch._C._nn.linear( + 
context_layer_11, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_11 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_17 = torch.nn.functional.dropout( + hidden_states_16, 0.1, False, False + ) + hidden_states_16 = None + add_8 = hidden_states_17 + hidden_states_15 + hidden_states_17 = hidden_states_15 = None + hidden_states_18 = torch.nn.functional.layer_norm( + add_8, + (768,), + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_20 = torch._C._nn.gelu(hidden_states_19) + hidden_states_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + add_9 = hidden_states_22 + hidden_states_18 + hidden_states_22 = hidden_states_18 = None + hidden_states_23 = torch.nn.functional.layer_norm( + add_9, + (768,), + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_9 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_18 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_12 = linear_18.view(1, -1, 12, 64) + linear_18 = None + query_layer_6 = view_12.transpose(1, 2) + view_12 = None + linear_19 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_13 = linear_19.view(1, -1, 12, 64) + linear_19 = None + key_layer_6 = view_13.transpose(1, 2) + view_13 = None + linear_20 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_14 = linear_20.view(1, -1, 12, 64) + linear_20 = None + value_layer_3 = view_14.transpose(1, 2) + view_14 = None + query_layer_7 = query_layer_6 / 2.8284271247461903 + query_layer_6 = None + key_layer_7 = key_layer_6 / 2.8284271247461903 + key_layer_6 = None + transpose_15 = key_layer_7.transpose(-1, -2) + key_layer_7 = None + attention_scores_6 = torch.matmul(query_layer_7, transpose_15) + query_layer_7 = transpose_15 = None + attention_scores_7 = attention_scores_6 + extended_attention_mask_2 + attention_scores_6 = None + attention_probs_3 = torch.nn.functional.softmax(attention_scores_7, dim=-1) + attention_scores_7 = None + context_layer_12 = torch.matmul(attention_probs_3, value_layer_3) + attention_probs_3 = None + conv2d_3 = torch.conv2d( + value_layer_3, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_3 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_12 += conv2d_3 + context_layer_13 = context_layer_12 + context_layer_12 = conv2d_3 = None + permute_3 = context_layer_13.permute(0, 2, 1, 3) + context_layer_13 = None + context_layer_14 = permute_3.contiguous() + permute_3 = None + context_layer_15 = context_layer_14.view(1, 16, 768) + context_layer_14 = None + hidden_states_24 = torch._C._nn.linear( + context_layer_15, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_15 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_25 = torch.nn.functional.dropout( + hidden_states_24, 0.1, False, False + ) + hidden_states_24 = None + add_11 = hidden_states_25 + hidden_states_23 + hidden_states_25 = hidden_states_23 = None + 
hidden_states_26 = torch.nn.functional.layer_norm( + add_11, + (768,), + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_28 = torch._C._nn.gelu(hidden_states_27) + hidden_states_27 = None + hidden_states_29 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + add_12 = hidden_states_30 + hidden_states_26 + hidden_states_30 = hidden_states_26 = None + hidden_states_31 = torch.nn.functional.layer_norm( + add_12, + (768,), + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_12 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_24 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_16 = linear_24.view(1, -1, 12, 64) + linear_24 = None + query_layer_8 = view_16.transpose(1, 2) + view_16 = None + linear_25 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_17 = linear_25.view(1, -1, 12, 64) + linear_25 = 
None + key_layer_8 = view_17.transpose(1, 2) + view_17 = None + linear_26 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_18 = linear_26.view(1, -1, 12, 64) + linear_26 = None + value_layer_4 = view_18.transpose(1, 2) + view_18 = None + query_layer_9 = query_layer_8 / 2.8284271247461903 + query_layer_8 = None + key_layer_9 = key_layer_8 / 2.8284271247461903 + key_layer_8 = None + transpose_19 = key_layer_9.transpose(-1, -2) + key_layer_9 = None + attention_scores_8 = torch.matmul(query_layer_9, transpose_19) + query_layer_9 = transpose_19 = None + attention_scores_9 = attention_scores_8 + extended_attention_mask_2 + attention_scores_8 = None + attention_probs_4 = torch.nn.functional.softmax(attention_scores_9, dim=-1) + attention_scores_9 = None + context_layer_16 = torch.matmul(attention_probs_4, value_layer_4) + attention_probs_4 = None + conv2d_4 = torch.conv2d( + value_layer_4, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_4 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_16 += conv2d_4 + context_layer_17 = context_layer_16 + context_layer_16 = conv2d_4 = None + permute_4 = context_layer_17.permute(0, 2, 1, 3) + context_layer_17 = None + context_layer_18 = permute_4.contiguous() + permute_4 = None + context_layer_19 = context_layer_18.view(1, 16, 768) + context_layer_18 = None + hidden_states_32 = torch._C._nn.linear( + context_layer_19, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_19 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_33 = torch.nn.functional.dropout( + hidden_states_32, 0.1, False, False + ) + hidden_states_32 = None + add_14 = hidden_states_33 + hidden_states_31 + hidden_states_33 = hidden_states_31 = None + hidden_states_34 = torch.nn.functional.layer_norm( + add_14, + (768,), + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_35 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_36 = torch._C._nn.gelu(hidden_states_35) + hidden_states_35 = None + hidden_states_37 = torch._C._nn.linear( + hidden_states_36, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_38 = torch.nn.functional.dropout( + hidden_states_37, 0.1, False, False + ) + hidden_states_37 = None + add_15 = hidden_states_38 + hidden_states_34 + hidden_states_38 = hidden_states_34 = None + hidden_states_39 = torch.nn.functional.layer_norm( + add_15, + (768,), + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_15 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_30 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_20 = linear_30.view(1, -1, 12, 64) + linear_30 = None + query_layer_10 = view_20.transpose(1, 2) + view_20 = None + linear_31 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_21 = linear_31.view(1, -1, 12, 64) + linear_31 = None + key_layer_10 = view_21.transpose(1, 2) + view_21 = None + linear_32 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_22 = linear_32.view(1, -1, 12, 64) + linear_32 = None + value_layer_5 = view_22.transpose(1, 2) + view_22 = None + query_layer_11 = query_layer_10 / 
2.8284271247461903 + query_layer_10 = None + key_layer_11 = key_layer_10 / 2.8284271247461903 + key_layer_10 = None + transpose_23 = key_layer_11.transpose(-1, -2) + key_layer_11 = None + attention_scores_10 = torch.matmul(query_layer_11, transpose_23) + query_layer_11 = transpose_23 = None + attention_scores_11 = attention_scores_10 + extended_attention_mask_2 + attention_scores_10 = None + attention_probs_5 = torch.nn.functional.softmax(attention_scores_11, dim=-1) + attention_scores_11 = None + context_layer_20 = torch.matmul(attention_probs_5, value_layer_5) + attention_probs_5 = None + conv2d_5 = torch.conv2d( + value_layer_5, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_5 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_20 += conv2d_5 + context_layer_21 = context_layer_20 + context_layer_20 = conv2d_5 = None + permute_5 = context_layer_21.permute(0, 2, 1, 3) + context_layer_21 = None + context_layer_22 = permute_5.contiguous() + permute_5 = None + context_layer_23 = context_layer_22.view(1, 16, 768) + context_layer_22 = None + hidden_states_40 = torch._C._nn.linear( + context_layer_23, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_23 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_41 = torch.nn.functional.dropout( + hidden_states_40, 0.1, False, False + ) + hidden_states_40 = None + add_17 = hidden_states_41 + hidden_states_39 + hidden_states_41 = hidden_states_39 = None + hidden_states_42 = torch.nn.functional.layer_norm( + add_17, + (768,), + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_43 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_44 = torch._C._nn.gelu(hidden_states_43) + hidden_states_43 = None + hidden_states_45 = torch._C._nn.linear( + hidden_states_44, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ 
= l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_46 = torch.nn.functional.dropout( + hidden_states_45, 0.1, False, False + ) + hidden_states_45 = None + add_18 = hidden_states_46 + hidden_states_42 + hidden_states_46 = hidden_states_42 = None + hidden_states_47 = torch.nn.functional.layer_norm( + add_18, + (768,), + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_18 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_36 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_24 = linear_36.view(1, -1, 12, 64) + linear_36 = None + query_layer_12 = view_24.transpose(1, 2) + view_24 = None + linear_37 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_25 = linear_37.view(1, -1, 12, 64) + linear_37 = None + key_layer_12 = view_25.transpose(1, 2) + view_25 = None + linear_38 = torch._C._nn.linear( + hidden_states_47, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_26 = linear_38.view(1, -1, 12, 64) + linear_38 = None + value_layer_6 = view_26.transpose(1, 2) + view_26 = None + query_layer_13 = query_layer_12 / 2.8284271247461903 + query_layer_12 = None + key_layer_13 = key_layer_12 / 2.8284271247461903 + key_layer_12 = None + transpose_27 = key_layer_13.transpose(-1, -2) + key_layer_13 = None + attention_scores_12 = torch.matmul(query_layer_13, transpose_27) + query_layer_13 = transpose_27 = None + attention_scores_13 = attention_scores_12 + extended_attention_mask_2 + attention_scores_12 = None + attention_probs_6 = torch.nn.functional.softmax(attention_scores_13, dim=-1) + attention_scores_13 = None + context_layer_24 = torch.matmul(attention_probs_6, value_layer_6) + attention_probs_6 = None + conv2d_6 = torch.conv2d( + value_layer_6, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 
1), + 12, + ) + value_layer_6 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_24 += conv2d_6 + context_layer_25 = context_layer_24 + context_layer_24 = conv2d_6 = None + permute_6 = context_layer_25.permute(0, 2, 1, 3) + context_layer_25 = None + context_layer_26 = permute_6.contiguous() + permute_6 = None + context_layer_27 = context_layer_26.view(1, 16, 768) + context_layer_26 = None + hidden_states_48 = torch._C._nn.linear( + context_layer_27, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_27 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_49 = torch.nn.functional.dropout( + hidden_states_48, 0.1, False, False + ) + hidden_states_48 = None + add_20 = hidden_states_49 + hidden_states_47 + hidden_states_49 = hidden_states_47 = None + hidden_states_50 = torch.nn.functional.layer_norm( + add_20, + (768,), + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_51 = torch._C._nn.linear( + hidden_states_50, + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_52 = torch._C._nn.gelu(hidden_states_51) + hidden_states_51 = None + hidden_states_53 = torch._C._nn.linear( + hidden_states_52, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_54 = torch.nn.functional.dropout( + hidden_states_53, 0.1, False, False + ) + hidden_states_53 = None + add_21 = hidden_states_54 + hidden_states_50 + hidden_states_54 = hidden_states_50 = None + hidden_states_55 = torch.nn.functional.layer_norm( + add_21, + (768,), + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_21 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_42 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_28 = linear_42.view(1, -1, 12, 64) + linear_42 = None + query_layer_14 = view_28.transpose(1, 2) + view_28 = None + linear_43 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_29 = linear_43.view(1, -1, 12, 64) + linear_43 = None + key_layer_14 = view_29.transpose(1, 2) + view_29 = None + linear_44 = torch._C._nn.linear( + hidden_states_55, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_30 = linear_44.view(1, -1, 12, 64) + linear_44 = None + value_layer_7 = view_30.transpose(1, 2) + view_30 = None + query_layer_15 = query_layer_14 / 2.8284271247461903 + query_layer_14 = None + key_layer_15 = key_layer_14 / 2.8284271247461903 + key_layer_14 = None + transpose_31 = key_layer_15.transpose(-1, -2) + key_layer_15 = None + attention_scores_14 = torch.matmul(query_layer_15, transpose_31) + query_layer_15 = transpose_31 = None + attention_scores_15 = attention_scores_14 + extended_attention_mask_2 + attention_scores_14 = None + attention_probs_7 = torch.nn.functional.softmax(attention_scores_15, dim=-1) + attention_scores_15 = None + context_layer_28 = torch.matmul(attention_probs_7, value_layer_7) + attention_probs_7 = None + conv2d_7 = torch.conv2d( + value_layer_7, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_7 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_28 += conv2d_7 + context_layer_29 = context_layer_28 + context_layer_28 = conv2d_7 = None + permute_7 = context_layer_29.permute(0, 2, 1, 3) + context_layer_29 = None + context_layer_30 = permute_7.contiguous() + permute_7 = None + context_layer_31 = context_layer_30.view(1, 16, 768) + context_layer_30 = None + hidden_states_56 = torch._C._nn.linear( + context_layer_31, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_31 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_57 = torch.nn.functional.dropout( + hidden_states_56, 0.1, False, False + ) + hidden_states_56 = None + add_23 = hidden_states_57 + hidden_states_55 + hidden_states_57 = hidden_states_55 = None + hidden_states_58 = torch.nn.functional.layer_norm( + add_23, + (768,), + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_59 = torch._C._nn.linear( + hidden_states_58, + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_60 = torch._C._nn.gelu(hidden_states_59) + hidden_states_59 = None + hidden_states_61 = torch._C._nn.linear( + hidden_states_60, + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_62 = torch.nn.functional.dropout( + hidden_states_61, 0.1, False, False + ) + hidden_states_61 = None + add_24 = hidden_states_62 + hidden_states_58 + hidden_states_62 = hidden_states_58 = None + hidden_states_63 = torch.nn.functional.layer_norm( + add_24, + (768,), + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_24 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_48 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_32 = linear_48.view(1, -1, 12, 64) + linear_48 = None + query_layer_16 = view_32.transpose(1, 2) 
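+        # As in every layer, Q/K/V are three separate 768 -> 768 projections
+        # of the layer input, each reshaped to (batch, seq, 12, 64) and
+        # transposed to (batch, heads, seq, head_dim) before the attention
+        # products.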
+ view_32 = None + linear_49 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_33 = linear_49.view(1, -1, 12, 64) + linear_49 = None + key_layer_16 = view_33.transpose(1, 2) + view_33 = None + linear_50 = torch._C._nn.linear( + hidden_states_63, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_34 = linear_50.view(1, -1, 12, 64) + linear_50 = None + value_layer_8 = view_34.transpose(1, 2) + view_34 = None + query_layer_17 = query_layer_16 / 2.8284271247461903 + query_layer_16 = None + key_layer_17 = key_layer_16 / 2.8284271247461903 + key_layer_16 = None + transpose_35 = key_layer_17.transpose(-1, -2) + key_layer_17 = None + attention_scores_16 = torch.matmul(query_layer_17, transpose_35) + query_layer_17 = transpose_35 = None + attention_scores_17 = attention_scores_16 + extended_attention_mask_2 + attention_scores_16 = None + attention_probs_8 = torch.nn.functional.softmax(attention_scores_17, dim=-1) + attention_scores_17 = None + context_layer_32 = torch.matmul(attention_probs_8, value_layer_8) + attention_probs_8 = None + conv2d_8 = torch.conv2d( + value_layer_8, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_8 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_32 += conv2d_8 + context_layer_33 = context_layer_32 + context_layer_32 = conv2d_8 = None + permute_8 = context_layer_33.permute(0, 2, 1, 3) + context_layer_33 = None + context_layer_34 = permute_8.contiguous() + permute_8 = None + context_layer_35 = context_layer_34.view(1, 16, 768) + context_layer_34 = None + hidden_states_64 = torch._C._nn.linear( + context_layer_35, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_35 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_65 = torch.nn.functional.dropout( + hidden_states_64, 0.1, False, False + ) + hidden_states_64 = None + add_26 = hidden_states_65 + hidden_states_63 + hidden_states_65 = hidden_states_63 = None + hidden_states_66 = torch.nn.functional.layer_norm( + add_26, + (768,), + l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_67 = torch._C._nn.linear( + hidden_states_66, + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_68 = torch._C._nn.gelu(hidden_states_67) + hidden_states_67 = None + hidden_states_69 = torch._C._nn.linear( + hidden_states_68, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_70 = torch.nn.functional.dropout( + hidden_states_69, 0.1, False, False + ) + hidden_states_69 = None + add_27 = hidden_states_70 + hidden_states_66 + hidden_states_70 = hidden_states_66 = None + hidden_states_71 = torch.nn.functional.layer_norm( + add_27, + (768,), + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_27 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_54 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_36 = linear_54.view(1, -1, 12, 64) + linear_54 = None + query_layer_18 = view_36.transpose(1, 2) + view_36 = None + linear_55 = torch._C._nn.linear( + hidden_states_71, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_37 = linear_55.view(1, -1, 12, 64) + linear_55 = None + key_layer_18 = view_37.transpose(1, 2) + view_37 = None + linear_56 = torch._C._nn.linear( + hidden_states_71, + 
l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_38 = linear_56.view(1, -1, 12, 64) + linear_56 = None + value_layer_9 = view_38.transpose(1, 2) + view_38 = None + query_layer_19 = query_layer_18 / 2.8284271247461903 + query_layer_18 = None + key_layer_19 = key_layer_18 / 2.8284271247461903 + key_layer_18 = None + transpose_39 = key_layer_19.transpose(-1, -2) + key_layer_19 = None + attention_scores_18 = torch.matmul(query_layer_19, transpose_39) + query_layer_19 = transpose_39 = None + attention_scores_19 = attention_scores_18 + extended_attention_mask_2 + attention_scores_18 = None + attention_probs_9 = torch.nn.functional.softmax(attention_scores_19, dim=-1) + attention_scores_19 = None + context_layer_36 = torch.matmul(attention_probs_9, value_layer_9) + attention_probs_9 = None + conv2d_9 = torch.conv2d( + value_layer_9, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_9 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_36 += conv2d_9 + context_layer_37 = context_layer_36 + context_layer_36 = conv2d_9 = None + permute_9 = context_layer_37.permute(0, 2, 1, 3) + context_layer_37 = None + context_layer_38 = permute_9.contiguous() + permute_9 = None + context_layer_39 = context_layer_38.view(1, 16, 768) + context_layer_38 = None + hidden_states_72 = torch._C._nn.linear( + context_layer_39, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_39 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_73 = torch.nn.functional.dropout( + hidden_states_72, 0.1, False, False + ) + hidden_states_72 = None + add_29 = hidden_states_73 + hidden_states_71 + hidden_states_73 = hidden_states_71 = None + hidden_states_74 = torch.nn.functional.layer_norm( + add_29, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_75 = torch._C._nn.linear( + hidden_states_74, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_, + ) + 
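+        # The chained "name = ... = (None)" assignments are emitted by the
+        # graph capture to drop the last reference to each dead intermediate
+        # and parameter handle, so its memory can be reclaimed as the graph
+        # executes.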
l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_76 = torch._C._nn.gelu(hidden_states_75) + hidden_states_75 = None + hidden_states_77 = torch._C._nn.linear( + hidden_states_76, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_78 = torch.nn.functional.dropout( + hidden_states_77, 0.1, False, False + ) + hidden_states_77 = None + add_30 = hidden_states_78 + hidden_states_74 + hidden_states_78 = hidden_states_74 = None + hidden_states_79 = torch.nn.functional.layer_norm( + add_30, + (768,), + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_30 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_60 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_40 = linear_60.view(1, -1, 12, 64) + linear_60 = None + query_layer_20 = view_40.transpose(1, 2) + view_40 = None + linear_61 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_41 = linear_61.view(1, -1, 12, 64) + linear_61 = None + key_layer_20 = view_41.transpose(1, 2) + view_41 = None + linear_62 = torch._C._nn.linear( + hidden_states_79, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_42 = linear_62.view(1, -1, 12, 64) + linear_62 = None + value_layer_10 = view_42.transpose(1, 2) + view_42 = None + query_layer_21 = query_layer_20 / 2.8284271247461903 + query_layer_20 = None + key_layer_21 = key_layer_20 / 2.8284271247461903 + 
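+        # 2.8284271247461903 == 64 ** 0.25: dividing both q and k by
+        # head_dim ** 0.25 applies the standard 1 / sqrt(head_dim) softmax
+        # scaling, split evenly across the two matmul operands.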
key_layer_20 = None + transpose_43 = key_layer_21.transpose(-1, -2) + key_layer_21 = None + attention_scores_20 = torch.matmul(query_layer_21, transpose_43) + query_layer_21 = transpose_43 = None + attention_scores_21 = attention_scores_20 + extended_attention_mask_2 + attention_scores_20 = None + attention_probs_10 = torch.nn.functional.softmax(attention_scores_21, dim=-1) + attention_scores_21 = None + context_layer_40 = torch.matmul(attention_probs_10, value_layer_10) + attention_probs_10 = None + conv2d_10 = torch.conv2d( + value_layer_10, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_10 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_40 += conv2d_10 + context_layer_41 = context_layer_40 + context_layer_40 = conv2d_10 = None + permute_10 = context_layer_41.permute(0, 2, 1, 3) + context_layer_41 = None + context_layer_42 = permute_10.contiguous() + permute_10 = None + context_layer_43 = context_layer_42.view(1, 16, 768) + context_layer_42 = None + hidden_states_80 = torch._C._nn.linear( + context_layer_43, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_43 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_81 = torch.nn.functional.dropout( + hidden_states_80, 0.1, False, False + ) + hidden_states_80 = None + add_32 = hidden_states_81 + hidden_states_79 + hidden_states_81 = hidden_states_79 = None + hidden_states_82 = torch.nn.functional.layer_norm( + add_32, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_83 = torch._C._nn.linear( + hidden_states_82, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_84 = torch._C._nn.gelu(hidden_states_83) + hidden_states_83 = None + hidden_states_85 = torch._C._nn.linear( + hidden_states_84, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_86 = torch.nn.functional.dropout( + hidden_states_85, 0.1, False, False + ) + hidden_states_85 = None + add_33 = hidden_states_86 + hidden_states_82 + hidden_states_86 = hidden_states_82 = None + hidden_states_87 = torch.nn.functional.layer_norm( + add_33, + (768,), + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_33 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_66 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_44 = linear_66.view(1, -1, 12, 64) + linear_66 = None + query_layer_22 = view_44.transpose(1, 2) + view_44 = None + linear_67 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_45 = linear_67.view(1, -1, 12, 64) + linear_67 = None + key_layer_22 = view_45.transpose(1, 2) + view_45 = None + linear_68 = torch._C._nn.linear( + hidden_states_87, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_46 = linear_68.view(1, -1, 12, 64) + linear_68 = None + value_layer_11 = view_46.transpose(1, 2) + view_46 = None + query_layer_23 = query_layer_22 / 2.8284271247461903 + query_layer_22 = None + key_layer_23 = key_layer_22 / 2.8284271247461903 + key_layer_22 = None + transpose_47 = key_layer_23.transpose(-1, -2) + key_layer_23 = None + attention_scores_22 = torch.matmul(query_layer_23, transpose_47) + query_layer_23 = transpose_47 = None + attention_scores_23 = attention_scores_22 + extended_attention_mask_2 + attention_scores_22 = extended_attention_mask_2 = None + attention_probs_11 = torch.nn.functional.softmax(attention_scores_23, dim=-1) + attention_scores_23 = None + context_layer_44 = torch.matmul(attention_probs_11, value_layer_11) + attention_probs_11 = None + conv2d_11 = torch.conv2d( + value_layer_11, + 
l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_, + None, + (1, 1), + (32, 0), + (1, 1), + 12, + ) + value_layer_11 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_ = (None) + context_layer_44 += conv2d_11 + context_layer_45 = context_layer_44 + context_layer_44 = conv2d_11 = None + permute_11 = context_layer_45.permute(0, 2, 1, 3) + context_layer_45 = None + context_layer_46 = permute_11.contiguous() + permute_11 = None + context_layer_47 = context_layer_46.view(1, 16, 768) + context_layer_46 = None + hidden_states_88 = torch._C._nn.linear( + context_layer_47, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_47 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_89 = torch.nn.functional.dropout( + hidden_states_88, 0.1, False, False + ) + hidden_states_88 = None + add_35 = hidden_states_89 + hidden_states_87 + hidden_states_89 = hidden_states_87 = None + hidden_states_90 = torch.nn.functional.layer_norm( + add_35, + (768,), + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_91 = torch._C._nn.linear( + hidden_states_90, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_92 = torch._C._nn.gelu(hidden_states_91) + hidden_states_91 = None + hidden_states_93 = torch._C._nn.linear( + hidden_states_92, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_94 = torch.nn.functional.dropout( + hidden_states_93, 0.1, False, False + ) + hidden_states_93 = None + add_36 = hidden_states_94 + hidden_states_90 + hidden_states_94 = hidden_states_90 = None + hidden_states_95 = torch.nn.functional.layer_norm( + add_36, + (768,), + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_, + 1e-05, + ) + add_36 = 
l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = (None)
+        return (hidden_states_95,)
diff --git a/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/weight_meta.py b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/weight_meta.py
new file mode 100644
index 000000000..ddafc62c6
--- /dev/null
+++ b/samples/transformers-auto-model/ogoshi2000_stance-nystromformer/weight_meta.py
@@ -0,0 +1,2147 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 16]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [
+        0,
+        21042,
+        2295,
+        14021,
+        1877,
+        45397,
+        1379,
+        3434,
+        2291,
+        4612,
+        15531,
+        18289,
+        36823,
+        54,
+        4,
+        2,
+    ]
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 16]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_token_type_ids_:
+    name = "L_self_modules_embeddings_buffers_token_type_ids_"
+    shape = [1, 8192]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 0
+    max_val = 0
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [52009, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_:
+    name = "L_self_modules_embeddings_buffers_position_ids_"
+    shape = [1, 8192]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 2
+    max_val = 8193
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [1, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [8194, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
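Each Program_weight_tensor_meta_* class above records just enough to rebuild a stand-in tensor, presumably for replaying the captured graph without the original checkpoint: small integer inputs keep their raw data, index buffers record only a min_val/max_val range, and float parameters record their mean/std. Below is a minimal sketch of how such metadata could be materialized; the materialize helper is illustrative only and is not part of the sample format, which ships no consumer in this patch.

    import torch

    def materialize(meta):
        # Illustrative helper (not part of the sample format): rebuild a
        # stand-in tensor from one Program_weight_tensor_meta_* class.
        dtype = getattr(torch, meta.dtype.split(".")[-1])  # "torch.float32" -> torch.float32
        if getattr(meta, "data", None) is not None:
            # Small tensors (e.g. L_input_ids_) store their exact values.
            return torch.tensor(meta.data, dtype=dtype).reshape(meta.shape).to(meta.device)
        if meta.mean is None:
            # Integer buffers store only a value range.
            return torch.randint(meta.min_val, meta.max_val + 1, meta.shape,
                                 dtype=dtype, device=meta.device)
        # Float parameters store first/second moments.
        return torch.randn(meta.shape, dtype=dtype, device=meta.device) * meta.std + meta.mean

For example, materialize(Program_weight_tensor_meta_L_input_ids_) reproduces the [1, 16] token-id tensor exactly, while the [52009, 768] word-embedding weight comes back as N(0.000, 0.020) noise of the right shape.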
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_conv_parameters_weight_"
+    shape = [12, 1, 65, 1]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
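For orientation, the unrolled per-layer attention in model.py above, together with the [12, 1, 65, 1] conv weight just described, corresponds to the compact module sketched below: attention that scales q and k each by head_dim ** 0.25 and adds a depthwise convolution over the value heads to the context, matching the conv residual and fourth-root scaling the stance-nystromformer folder name suggests. The module (AttentionSketch, its nn.Linear/nn.Conv2d layers, and the mask argument) is an illustrative reconstruction from the traced constants, not code shipped in the sample; the trace itself follows the exact-attention branch for this 16-token input.

    import torch
    import torch.nn as nn

    class AttentionSketch(nn.Module):
        # Illustrative reconstruction of one traced attention block.
        def __init__(self, hidden=768, heads=12, head_dim=64, kernel=65):
            super().__init__()
            self.heads, self.head_dim = heads, head_dim
            self.query = nn.Linear(hidden, hidden)
            self.key = nn.Linear(hidden, hidden)
            self.value = nn.Linear(hidden, hidden)
            # Depthwise conv over the sequence axis, one channel per head;
            # padding (kernel - 1) // 2 = 32 preserves the sequence length,
            # mirroring torch.conv2d(..., (1, 1), (32, 0), (1, 1), 12) above.
            self.conv = nn.Conv2d(heads, heads, (kernel, 1),
                                  padding=((kernel - 1) // 2, 0),
                                  groups=heads, bias=False)

        def forward(self, x, extended_mask):
            b, s, _ = x.shape

            def split(t):  # (b, s, 768) -> (b, heads, s, head_dim)
                return t.view(b, s, self.heads, self.head_dim).transpose(1, 2)

            q, k, v = split(self.query(x)), split(self.key(x)), split(self.value(x))
            scale = self.head_dim ** 0.25  # == 2.8284271247461903 in the trace
            scores = torch.matmul(q / scale, (k / scale).transpose(-1, -2))
            probs = torch.softmax(scores + extended_mask, dim=-1)
            context = torch.matmul(probs, v) + self.conv(v)
            return context.permute(0, 2, 1, 3).contiguous().view(b, s, -1)

An input x of shape (1, 16, 768) with a mask broadcastable to (1, 1, 1, 16) reproduces the tensor shapes seen in the trace; the output projection, dropout, residual adds, and LayerNorms then follow exactly as in the graph code above.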
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" 
+ device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.001 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.001 + std = 0.021 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.019 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_conv_parameters_weight_" + shape = [12, 1, 65, 1] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.001 + std = 0.021 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_conv_parameters_weight_"
+    shape = [12, 1, 65, 1]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.002
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_conv_parameters_weight_"
+    shape = [12, 1, 65, 1]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_conv_parameters_weight_"
+    shape = [12, 1, 65, 1]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_hash.txt b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_hash.txt
new file mode 100644
index 000000000..8f93052c9
--- /dev/null
+++ b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_hash.txt
@@ -0,0 +1 @@
+710c54ca0e2f3f39ad6de1442bd12d73983db9fb2cead0dd36ff3d2ba62c31f3
\ No newline at end of file
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_net.json b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_net.json
new file mode 100644
index 000000000..b6ffe9f72
--- /dev/null
+++ b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/graph_net.json
@@ -0,0 +1,6 @@
+{
+    "framework": "torch",
+    "num_devices_required": 1,
+    "num_nodes_required": 1,
+    "dynamic": false
+}
\ No newline at end of file
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_meta.py b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_meta.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_tensor_constraints.py b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/input_tensor_constraints.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/model.py b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/model.py
new file mode 100644
index 000000000..71beaa13d
--- /dev/null
+++ b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/model.py
@@ -0,0 +1,3271 @@
+import torch
+
+
+class GraphModule(torch.nn.Module):
GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_attention_mask_: torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + 
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter,
+        L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter,
+    ):
+        # Captured computational graph: every parameter and buffer arrives as an
+        # explicit forward() argument above and is rebound to a local alias below.
+        l_input_ids_ = L_input_ids_
+        l_attention_mask_ = L_attention_mask_
+        l_token_type_ids_ = L_token_type_ids_
+        l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_buffers_position_ids_ = (
+            L_self_modules_embeddings_buffers_position_ids_
+        )
+        l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = (
+            L_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_weight_
+        )
+        l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = (
+            L_self_modules_embeddings_modules_LayerNorm_parameters_bias_
+        )
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = 
L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_ + extended_attention_mask = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + extended_attention_mask_1 = extended_attention_mask.to(dtype=torch.float32) + extended_attention_mask = None + sub = 1.0 - extended_attention_mask_1 + extended_attention_mask_1 = None + extended_attention_mask_2 = sub * -3.4028234663852886e38 + sub = None + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(0, 20, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + 
+     l_input_ids_,
+     l_self_modules_embeddings_modules_word_embeddings_parameters_weight_,
+     0,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ l_input_ids_ = (
+     l_self_modules_embeddings_modules_word_embeddings_parameters_weight_
+ ) = None
+ token_type_embeddings = torch.nn.functional.embedding(
+     l_token_type_ids_,
+     l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_,
+     None,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ l_token_type_ids_ = (
+     l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_
+ ) = None
+ embeddings = inputs_embeds + token_type_embeddings
+ inputs_embeds = token_type_embeddings = None
+ position_embeddings = torch.nn.functional.embedding(
+     position_ids,
+     l_self_modules_embeddings_modules_position_embeddings_parameters_weight_,
+     None,
+     None,
+     2.0,
+     False,
+     False,
+ )
+ position_ids = (
+     l_self_modules_embeddings_modules_position_embeddings_parameters_weight_
+ ) = None
+ embeddings += position_embeddings
+ embeddings_1 = embeddings
+ embeddings = position_embeddings = None
+ embeddings_2 = torch.nn.functional.layer_norm(
+     embeddings_1,
+     (1024,),
+     l_self_modules_embeddings_modules_layer_norm_parameters_weight_,
+     l_self_modules_embeddings_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ embeddings_1 = (
+     l_self_modules_embeddings_modules_layer_norm_parameters_weight_
+ ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None
+ embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False)
+ embeddings_2 = None
+ query_layer = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view = query_layer.view(1, -1, 16, 64)
+ query_layer = None
+ query_layer_1 = view.transpose(1, 2)
+ view = None
+ key_layer = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_1 = key_layer.view(1, -1, 16, 64)
+ key_layer = None
+ key_layer_1 = view_1.transpose(1, 2)
+ view_1 = None
+ value_layer = torch._C._nn.linear(
+     embeddings_3,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_2 = value_layer.view(1, -1, 16, 64)
+ value_layer = None
+ value_layer_1 = view_2.transpose(1, 2)
+ view_2 = None
+ transpose_3 = key_layer_1.transpose(-1, -2)
+ key_layer_1 = None
+ attention_scores = torch.matmul(query_layer_1, transpose_3)
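+ # NOTE (editorial, not emitted by the tracer): the straight-line code above is
+ # ordinary BERT-style multi-head self-attention, unrolled by the graph capture.
+ # A hedged, hand-written sketch of the same computation (assuming hidden size
+ # 1024 split into 16 heads of dim 64, and sequence length 20):
+ #
+ #     q = linear(x, w_q, b_q).view(1, -1, 16, 64).transpose(1, 2)
+ #     k = linear(x, w_k, b_k).view(1, -1, 16, 64).transpose(1, 2)
+ #     v = linear(x, w_v, b_v).view(1, -1, 16, 64).transpose(1, 2)
+ #     scores = q @ k.transpose(-1, -2) / 8.0  # 8.0 == sqrt(64)
+ #     probs = dropout(softmax(scores + extended_attention_mask_2, dim=-1), 0.1)
+ #     ctx = (probs @ v).permute(0, 2, 1, 3).contiguous().view(1, 20, 1024)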
+ query_layer_1 = transpose_3 = None
+ attention_scores_1 = attention_scores / 8.0
+ attention_scores = None
+ attention_scores_2 = attention_scores_1 + extended_attention_mask_2
+ attention_scores_1 = None
+ attention_probs = torch.nn.functional.softmax(attention_scores_2, dim=-1)
+ attention_scores_2 = None
+ attention_probs_1 = torch.nn.functional.dropout(
+     attention_probs, 0.1, False, False
+ )
+ attention_probs = None
+ context_layer = torch.matmul(attention_probs_1, value_layer_1)
+ attention_probs_1 = value_layer_1 = None
+ permute = context_layer.permute(0, 2, 1, 3)
+ context_layer = None
+ context_layer_1 = permute.contiguous()
+ permute = None
+ context_layer_2 = context_layer_1.view((1, 20, 1024))
+ context_layer_1 = None
+ hidden_states = torch._C._nn.linear(
+     context_layer_2,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ context_layer_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False)
+ hidden_states = None
+ add_2 = hidden_states_1 + embeddings_3
+ hidden_states_1 = embeddings_3 = None
+ hidden_states_2 = torch.nn.functional.layer_norm(
+     add_2,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_3 = torch._C._nn.linear(
+     hidden_states_2,
+     l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_4 = torch._C._nn.gelu(hidden_states_3)
+ hidden_states_3 = None
+ hidden_states_5 = torch._C._nn.linear(
+     hidden_states_4,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_6 = torch.nn.functional.dropout(
+     hidden_states_5, 0.1, False, False
+ )
+ hidden_states_5 = None
+ add_3 = hidden_states_6 + hidden_states_2
+ hidden_states_6 = hidden_states_2 = None
+ hidden_states_7 = torch.nn.functional.layer_norm(
+     add_3,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_3 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ query_layer_2 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_4 = query_layer_2.view(1, -1, 16, 64)
+ query_layer_2 = None
+ query_layer_3 = view_4.transpose(1, 2)
+ view_4 = None
+ key_layer_2 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_5 = key_layer_2.view(1, -1, 16, 64)
+ key_layer_2 = None
+ key_layer_3 = view_5.transpose(1, 2)
+ view_5 = None
+ value_layer_2 = torch._C._nn.linear(
+     hidden_states_7,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_6 = value_layer_2.view(1, -1, 16, 64)
+ value_layer_2 = None
+ value_layer_3 = view_6.transpose(1, 2)
+ view_6 = None
+ transpose_7 = key_layer_3.transpose(-1, -2)
+ key_layer_3 = None
+ attention_scores_3 = torch.matmul(query_layer_3, transpose_7)
+ query_layer_3 = transpose_7 = None
+ attention_scores_4 = attention_scores_3 / 8.0
+ attention_scores_3 = None
+ attention_scores_5 = attention_scores_4 + extended_attention_mask_2
+ attention_scores_4 = None
+ attention_probs_2 = torch.nn.functional.softmax(attention_scores_5, dim=-1)
+ attention_scores_5 = None
+ attention_probs_3 = torch.nn.functional.dropout(
+     attention_probs_2, 0.1, False, False
+ )
+ attention_probs_2 = None
+ context_layer_3 = torch.matmul(attention_probs_3, value_layer_3)
+ attention_probs_3 = value_layer_3 = None
+ permute_1 = context_layer_3.permute(0, 2, 1, 3)
+ context_layer_3 = None
+ context_layer_4 = permute_1.contiguous()
+ permute_1 = None
+ context_layer_5 = context_layer_4.view((1, 20, 1024))
+ context_layer_4 = None
+ hidden_states_8 = torch._C._nn.linear(
+     context_layer_5,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ context_layer_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_9 = torch.nn.functional.dropout(
+     hidden_states_8, 0.1, False, False
+ )
+ hidden_states_8 = None
+ add_5 = hidden_states_9 + hidden_states_7
+ hidden_states_9 = hidden_states_7 = None
+ hidden_states_10 = torch.nn.functional.layer_norm(
+     add_5,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_11 = torch._C._nn.linear(
+     hidden_states_10,
+     l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_12 = torch._C._nn.gelu(hidden_states_11)
+ hidden_states_11 = None
+ hidden_states_13 = torch._C._nn.linear(
+     hidden_states_12,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_14 = torch.nn.functional.dropout(
+     hidden_states_13, 0.1, False, False
+ )
+ hidden_states_13 = None
+ add_6 = hidden_states_14 + hidden_states_10
+ hidden_states_14 = hidden_states_10 = None
+ hidden_states_15 = torch.nn.functional.layer_norm(
+     add_6,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_6 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ query_layer_4 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_8 = query_layer_4.view(1, -1, 16, 64)
+ query_layer_4 = None
+ query_layer_5 = view_8.transpose(1, 2)
+ view_8 = None
+ key_layer_4 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_9 = key_layer_4.view(1, -1, 16, 64)
+ key_layer_4 = None
+ key_layer_5 = view_9.transpose(1, 2)
+ view_9 = None
+ value_layer_4 = torch._C._nn.linear(
+     hidden_states_15,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_10 = value_layer_4.view(1, -1, 16, 64)
+ value_layer_4 = None
+ value_layer_5 = view_10.transpose(1, 2)
+ view_10 = None
+ transpose_11 = key_layer_5.transpose(-1, -2)
+ key_layer_5 = None
+ attention_scores_6 = torch.matmul(query_layer_5, transpose_11)
+ query_layer_5 = transpose_11 = None
+ attention_scores_7 = attention_scores_6 / 8.0
+ attention_scores_6 = None
+ attention_scores_8 = attention_scores_7 + extended_attention_mask_2
+ attention_scores_7 = None
+ attention_probs_4 = torch.nn.functional.softmax(attention_scores_8, dim=-1)
+ attention_scores_8 = None
+ attention_probs_5 = torch.nn.functional.dropout(
+     attention_probs_4, 0.1, False, False
+ )
+ attention_probs_4 = None
+ context_layer_6 = torch.matmul(attention_probs_5, value_layer_5)
+ attention_probs_5 = value_layer_5 = None
+ permute_2 = context_layer_6.permute(0, 2, 1, 3)
+ context_layer_6 = None
+ context_layer_7 = permute_2.contiguous()
+ permute_2 = None
+ context_layer_8 = context_layer_7.view((1, 20, 1024))
+ context_layer_7 = None
+ hidden_states_16 = torch._C._nn.linear(
+     context_layer_8,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ context_layer_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_17 = torch.nn.functional.dropout(
+     hidden_states_16, 0.1, False, False
+ )
+ hidden_states_16 = None
+ add_8 = hidden_states_17 + hidden_states_15
+ hidden_states_17 = hidden_states_15 = None
+ hidden_states_18 = torch.nn.functional.layer_norm(
+     add_8,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_19 = torch._C._nn.linear(
+     hidden_states_18,
+     l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_20 = torch._C._nn.gelu(hidden_states_19)
+ hidden_states_19 = None
+ hidden_states_21 = torch._C._nn.linear(
+     hidden_states_20,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_22 = torch.nn.functional.dropout(
+     hidden_states_21, 0.1, False, False
+ )
+ hidden_states_21 = None
+ add_9 = hidden_states_22 + hidden_states_18
+ hidden_states_22 = hidden_states_18 = None
+ hidden_states_23 = torch.nn.functional.layer_norm(
+     add_9,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_9 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ query_layer_6 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None)
+ view_12 = query_layer_6.view(1, -1, 16, 64)
+ query_layer_6 = None
+ query_layer_7 = view_12.transpose(1, 2)
+ view_12 = None
+ key_layer_6 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None)
+ view_13 = key_layer_6.view(1, -1, 16, 64)
+ key_layer_6 = None
+ key_layer_7 = view_13.transpose(1, 2)
+ view_13 = None
+ value_layer_6 = torch._C._nn.linear(
+     hidden_states_23,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None)
+ view_14 = value_layer_6.view(1, -1, 16, 64)
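+ # NOTE (editorial): the `<name> = None` statements that follow almost every op
+ # are how the captured graph drops its last Python reference to a dead
+ # intermediate so its storage can be reclaimed eagerly; they carry no math.
+ # The `view(1, -1, 16, 64).transpose(1, 2)` pairs above reshape a
+ # [batch, seq, hidden] projection into [batch, heads, seq, head_dim].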
+ value_layer_6 = None
+ value_layer_7 = view_14.transpose(1, 2)
+ view_14 = None
+ transpose_15 = key_layer_7.transpose(-1, -2)
+ key_layer_7 = None
+ attention_scores_9 = torch.matmul(query_layer_7, transpose_15)
+ query_layer_7 = transpose_15 = None
+ attention_scores_10 = attention_scores_9 / 8.0
+ attention_scores_9 = None
+ attention_scores_11 = attention_scores_10 + extended_attention_mask_2
+ attention_scores_10 = None
+ attention_probs_6 = torch.nn.functional.softmax(attention_scores_11, dim=-1)
+ attention_scores_11 = None
+ attention_probs_7 = torch.nn.functional.dropout(
+     attention_probs_6, 0.1, False, False
+ )
+ attention_probs_6 = None
+ context_layer_9 = torch.matmul(attention_probs_7, value_layer_7)
+ attention_probs_7 = value_layer_7 = None
+ permute_3 = context_layer_9.permute(0, 2, 1, 3)
+ context_layer_9 = None
+ context_layer_10 = permute_3.contiguous()
+ permute_3 = None
+ context_layer_11 = context_layer_10.view((1, 20, 1024))
+ context_layer_10 = None
+ hidden_states_24 = torch._C._nn.linear(
+     context_layer_11,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_,
+ )
+ context_layer_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_25 = torch.nn.functional.dropout(
+     hidden_states_24, 0.1, False, False
+ )
+ hidden_states_24 = None
+ add_11 = hidden_states_25 + hidden_states_23
+ hidden_states_25 = hidden_states_23 = None
+ hidden_states_26 = torch.nn.functional.layer_norm(
+     add_11,
+     (1024,),
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+     1e-12,
+ )
+ add_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None)
+ hidden_states_27 = torch._C._nn.linear(
+     hidden_states_26,
+     l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_,
+ )
+ l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None)
+ hidden_states_28 = torch._C._nn.gelu(hidden_states_27)
+ hidden_states_27 = None
+ hidden_states_29 = torch._C._nn.linear(
+     hidden_states_28,
+     l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_,
+     l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_,
+ )
+ hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None)
+ hidden_states_30 = torch.nn.functional.dropout(
+     hidden_states_29, 0.1, False, False
+ )
+ hidden_states_29 = None
+ add_12 = hidden_states_30 + hidden_states_26
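+ # NOTE (editorial): `add_12` closes the layer-3 feed-forward residual. Every
+ # encoder layer in this capture follows the usual post-LayerNorm pattern;
+ # schematically (hedged sketch, not the tracer's output):
+ #
+ #     h = layer_norm(x + dropout(attn_output_dense(ctx)))
+ #     y = layer_norm(h + dropout(output_dense(gelu(intermediate_dense(h)))))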
+        hidden_states_30 = hidden_states_26 = None
+        hidden_states_31 = torch.nn.functional.layer_norm(
+            add_12,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_12 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_8 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_16 = query_layer_8.view(1, -1, 16, 64)
+        query_layer_8 = None
+        query_layer_9 = view_16.transpose(1, 2)
+        view_16 = None
+        key_layer_8 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_17 = key_layer_8.view(1, -1, 16, 64)
+        key_layer_8 = None
+        key_layer_9 = view_17.transpose(1, 2)
+        view_17 = None
+        value_layer_8 = torch._C._nn.linear(
+            hidden_states_31,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_18 = value_layer_8.view(1, -1, 16, 64)
+        value_layer_8 = None
+        value_layer_9 = view_18.transpose(1, 2)
+        view_18 = None
+        transpose_19 = key_layer_9.transpose(-1, -2)
+        key_layer_9 = None
+        attention_scores_12 = torch.matmul(query_layer_9, transpose_19)
+        query_layer_9 = transpose_19 = None
+        attention_scores_13 = attention_scores_12 / 8.0
+        attention_scores_12 = None
+        attention_scores_14 = attention_scores_13 + extended_attention_mask_2
+        attention_scores_13 = None
+        attention_probs_8 = torch.nn.functional.softmax(attention_scores_14, dim=-1)
+        attention_scores_14 = None
+        attention_probs_9 = torch.nn.functional.dropout(attention_probs_8, 0.1, False, False)
+        attention_probs_8 = None
+        context_layer_12 = torch.matmul(attention_probs_9, value_layer_9)
+        attention_probs_9 = value_layer_9 = None
+        permute_4 = context_layer_12.permute(0, 2, 1, 3)
+        context_layer_12 = None
+        context_layer_13 = permute_4.contiguous()
+        permute_4 = None
+        context_layer_14 = context_layer_13.view((1, 20, 1024))
+        context_layer_13 = None
+        hidden_states_32 = torch._C._nn.linear(
+            context_layer_14,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_33 = torch.nn.functional.dropout(hidden_states_32, 0.1, False, False)
+        hidden_states_32 = None
+        add_14 = hidden_states_33 + hidden_states_31
+        hidden_states_33 = hidden_states_31 = None
+        hidden_states_34 = torch.nn.functional.layer_norm(
+            add_14,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_35 = torch._C._nn.linear(
+            hidden_states_34,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_36 = torch._C._nn.gelu(hidden_states_35)
+        hidden_states_35 = None
+        hidden_states_37 = torch._C._nn.linear(
+            hidden_states_36,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_38 = torch.nn.functional.dropout(hidden_states_37, 0.1, False, False)
+        hidden_states_37 = None
+        add_15 = hidden_states_38 + hidden_states_34
+        hidden_states_38 = hidden_states_34 = None
+        hidden_states_39 = torch.nn.functional.layer_norm(
+            add_15,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_15 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_10 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_20 = query_layer_10.view(1, -1, 16, 64)
+        query_layer_10 = None
+        query_layer_11 = view_20.transpose(1, 2)
+        view_20 = None
+        key_layer_10 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_21 = key_layer_10.view(1, -1, 16, 64)
+        key_layer_10 = None
+        key_layer_11 = view_21.transpose(1, 2)
+        view_21 = None
+        value_layer_10 = torch._C._nn.linear(
+            hidden_states_39,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_22 = value_layer_10.view(1, -1, 16, 64)
+        value_layer_10 = None
+        value_layer_11 = view_22.transpose(1, 2)
+        view_22 = None
+        transpose_23 = key_layer_11.transpose(-1, -2)
+        key_layer_11 = None
+        attention_scores_15 = torch.matmul(query_layer_11, transpose_23)
+        query_layer_11 = transpose_23 = None
+        attention_scores_16 = attention_scores_15 / 8.0
+        attention_scores_15 = None
+        attention_scores_17 = attention_scores_16 + extended_attention_mask_2
+        attention_scores_16 = None
+        attention_probs_10 = torch.nn.functional.softmax(attention_scores_17, dim=-1)
+        attention_scores_17 = None
+        attention_probs_11 = torch.nn.functional.dropout(attention_probs_10, 0.1, False, False)
+        attention_probs_10 = None
+        context_layer_15 = torch.matmul(attention_probs_11, value_layer_11)
+        attention_probs_11 = value_layer_11 = None
+        permute_5 = context_layer_15.permute(0, 2, 1, 3)
+        context_layer_15 = None
+        context_layer_16 = permute_5.contiguous()
+        permute_5 = None
+        context_layer_17 = context_layer_16.view((1, 20, 1024))
+        context_layer_16 = None
+        hidden_states_40 = torch._C._nn.linear(
+            context_layer_17,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_41 = torch.nn.functional.dropout(hidden_states_40, 0.1, False, False)
+        hidden_states_40 = None
+        add_17 = hidden_states_41 + hidden_states_39
+        hidden_states_41 = hidden_states_39 = None
+        hidden_states_42 = torch.nn.functional.layer_norm(
+            add_17,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_43 = torch._C._nn.linear(
+            hidden_states_42,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_44 = torch._C._nn.gelu(hidden_states_43)
+        hidden_states_43 = None
+        hidden_states_45 = torch._C._nn.linear(
+            hidden_states_44,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_46 = torch.nn.functional.dropout(hidden_states_45, 0.1, False, False)
+        hidden_states_45 = None
+        add_18 = hidden_states_46 + hidden_states_42
+        hidden_states_46 = hidden_states_42 = None
+        hidden_states_47 = torch.nn.functional.layer_norm(
+            add_18,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_18 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_12 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_24 = query_layer_12.view(1, -1, 16, 64)
+        query_layer_12 = None
+        query_layer_13 = view_24.transpose(1, 2)
+        view_24 = None
+        key_layer_12 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_25 = key_layer_12.view(1, -1, 16, 64)
+        key_layer_12 = None
+        key_layer_13 = view_25.transpose(1, 2)
+        view_25 = None
+        value_layer_12 = torch._C._nn.linear(
+            hidden_states_47,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_26 = value_layer_12.view(1, -1, 16, 64)
+        value_layer_12 = None
+        value_layer_13 = view_26.transpose(1, 2)
+        view_26 = None
+        transpose_27 = key_layer_13.transpose(-1, -2)
+        key_layer_13 = None
+        attention_scores_18 = torch.matmul(query_layer_13, transpose_27)
+        query_layer_13 = transpose_27 = None
+        attention_scores_19 = attention_scores_18 / 8.0
+        attention_scores_18 = None
+        attention_scores_20 = attention_scores_19 + extended_attention_mask_2
+        attention_scores_19 = None
+        attention_probs_12 = torch.nn.functional.softmax(attention_scores_20, dim=-1)
+        attention_scores_20 = None
+        attention_probs_13 = torch.nn.functional.dropout(attention_probs_12, 0.1, False, False)
+        attention_probs_12 = None
+        context_layer_18 = torch.matmul(attention_probs_13, value_layer_13)
+        attention_probs_13 = value_layer_13 = None
+        permute_6 = context_layer_18.permute(0, 2, 1, 3)
+        context_layer_18 = None
+        context_layer_19 = permute_6.contiguous()
+        permute_6 = None
+        context_layer_20 = context_layer_19.view((1, 20, 1024))
+        context_layer_19 = None
+        hidden_states_48 = torch._C._nn.linear(
+            context_layer_20,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_49 = torch.nn.functional.dropout(hidden_states_48, 0.1, False, False)
+        hidden_states_48 = None
+        add_20 = hidden_states_49 + hidden_states_47
+        hidden_states_49 = hidden_states_47 = None
+        hidden_states_50 = torch.nn.functional.layer_norm(
+            add_20,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_20 = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_51 = torch._C._nn.linear(
+            hidden_states_50,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_52 = torch._C._nn.gelu(hidden_states_51)
+        hidden_states_51 = None
+        hidden_states_53 = torch._C._nn.linear(
+            hidden_states_52,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_52 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_54 = torch.nn.functional.dropout(hidden_states_53, 0.1, False, False)
+        hidden_states_53 = None
+        add_21 = hidden_states_54 + hidden_states_50
+        hidden_states_54 = hidden_states_50 = None
+        hidden_states_55 = torch.nn.functional.layer_norm(
+            add_21,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_21 = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_6_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_14 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_28 = query_layer_14.view(1, -1, 16, 64)
+        query_layer_14 = None
+        query_layer_15 = view_28.transpose(1, 2)
+        view_28 = None
+        key_layer_14 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_29 = key_layer_14.view(1, -1, 16, 64)
+        key_layer_14 = None
+        key_layer_15 = view_29.transpose(1, 2)
+        view_29 = None
+        value_layer_14 = torch._C._nn.linear(
+            hidden_states_55,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_30 = value_layer_14.view(1, -1, 16, 64)
+        value_layer_14 = None
+        value_layer_15 = view_30.transpose(1, 2)
+        view_30 = None
+        transpose_31 = key_layer_15.transpose(-1, -2)
+        key_layer_15 = None
+        attention_scores_21 = torch.matmul(query_layer_15, transpose_31)
+        query_layer_15 = transpose_31 = None
+        attention_scores_22 = attention_scores_21 / 8.0
+        attention_scores_21 = None
+        attention_scores_23 = attention_scores_22 + extended_attention_mask_2
+        attention_scores_22 = None
+        attention_probs_14 = torch.nn.functional.softmax(attention_scores_23, dim=-1)
+        attention_scores_23 = None
+        attention_probs_15 = torch.nn.functional.dropout(attention_probs_14, 0.1, False, False)
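Editor's note: every dropout in this trace was recorded as `torch.nn.functional.dropout(x, 0.1, False, False)`, i.e. p=0.1 with training=False and inplace=False, so at capture time each of these calls is an identity; the 0.1 only matters if the graph were re-run in training mode. A one-liner confirming the semantics:

    import torch

    x = torch.randn(1, 16, 20, 20)
    y = torch.nn.functional.dropout(x, 0.1, False, False)  # training=False: no-op
    assert torch.equal(x, y)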
+        attention_probs_14 = None
+        context_layer_21 = torch.matmul(attention_probs_15, value_layer_15)
+        attention_probs_15 = value_layer_15 = None
+        permute_7 = context_layer_21.permute(0, 2, 1, 3)
+        context_layer_21 = None
+        context_layer_22 = permute_7.contiguous()
+        permute_7 = None
+        context_layer_23 = context_layer_22.view((1, 20, 1024))
+        context_layer_22 = None
+        hidden_states_56 = torch._C._nn.linear(
+            context_layer_23,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_57 = torch.nn.functional.dropout(hidden_states_56, 0.1, False, False)
+        hidden_states_56 = None
+        add_23 = hidden_states_57 + hidden_states_55
+        hidden_states_57 = hidden_states_55 = None
+        hidden_states_58 = torch.nn.functional.layer_norm(
+            add_23,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_23 = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_59 = torch._C._nn.linear(
+            hidden_states_58,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_60 = torch._C._nn.gelu(hidden_states_59)
+        hidden_states_59 = None
+        hidden_states_61 = torch._C._nn.linear(
+            hidden_states_60,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_60 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_62 = torch.nn.functional.dropout(hidden_states_61, 0.1, False, False)
+        hidden_states_61 = None
+        add_24 = hidden_states_62 + hidden_states_58
+        hidden_states_62 = hidden_states_58 = None
+        hidden_states_63 = torch.nn.functional.layer_norm(
+            add_24,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_24 = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_7_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_16 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_32 = query_layer_16.view(1, -1, 16, 64)
+        query_layer_16 = None
+        query_layer_17 = view_32.transpose(1, 2)
+        view_32 = None
+        key_layer_16 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_33 = key_layer_16.view(1, -1, 16, 64)
+        key_layer_16 = None
+        key_layer_17 = view_33.transpose(1, 2)
+        view_33 = None
+        value_layer_16 = torch._C._nn.linear(
+            hidden_states_63,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_34 = value_layer_16.view(1, -1, 16, 64)
+        value_layer_16 = None
+        value_layer_17 = view_34.transpose(1, 2)
+        view_34 = None
+        transpose_35 = key_layer_17.transpose(-1, -2)
+        key_layer_17 = None
+        attention_scores_24 = torch.matmul(query_layer_17, transpose_35)
+        query_layer_17 = transpose_35 = None
+        attention_scores_25 = attention_scores_24 / 8.0
+        attention_scores_24 = None
+        attention_scores_26 = attention_scores_25 + extended_attention_mask_2
+        attention_scores_25 = None
+        attention_probs_16 = torch.nn.functional.softmax(attention_scores_26, dim=-1)
+        attention_scores_26 = None
+        attention_probs_17 = torch.nn.functional.dropout(attention_probs_16, 0.1, False, False)
+        attention_probs_16 = None
+        context_layer_24 = torch.matmul(attention_probs_17, value_layer_17)
+        attention_probs_17 = value_layer_17 = None
+        permute_8 = context_layer_24.permute(0, 2, 1, 3)
+        context_layer_24 = None
+        context_layer_25 = permute_8.contiguous()
+        permute_8 = None
+        context_layer_26 = context_layer_25.view((1, 20, 1024))
+        context_layer_25 = None
+        hidden_states_64 = torch._C._nn.linear(
+            context_layer_26,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_65 = torch.nn.functional.dropout(hidden_states_64, 0.1, False, False)
+        hidden_states_64 = None
+        add_26 = hidden_states_65 + hidden_states_63
+        hidden_states_65 = hidden_states_63 = None
+        hidden_states_66 = torch.nn.functional.layer_norm(
+            add_26,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_26 = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_67 = torch._C._nn.linear(
+            hidden_states_66,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_68 = torch._C._nn.gelu(hidden_states_67)
+        hidden_states_67 = None
+        hidden_states_69 = torch._C._nn.linear(
+            hidden_states_68,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_68 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_70 = torch.nn.functional.dropout(hidden_states_69, 0.1, False, False)
+        hidden_states_69 = None
+        add_27 = hidden_states_70 + hidden_states_66
+        hidden_states_70 = hidden_states_66 = None
+        hidden_states_71 = torch.nn.functional.layer_norm(
+            add_27,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_27 = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_8_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_18 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_36 = query_layer_18.view(1, -1, 16, 64)
+        query_layer_18 = None
+        query_layer_19 = view_36.transpose(1, 2)
+        view_36 = None
+        key_layer_18 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_37 = key_layer_18.view(1, -1, 16, 64)
+        key_layer_18 = None
+        key_layer_19 = view_37.transpose(1, 2)
+        view_37 = None
+        value_layer_18 = torch._C._nn.linear(
+            hidden_states_71,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_38 = value_layer_18.view(1, -1, 16, 64)
+        value_layer_18 = None
+        value_layer_19 = view_38.transpose(1, 2)
+        view_38 = None
+        transpose_39 = key_layer_19.transpose(-1, -2)
+        key_layer_19 = None
+        attention_scores_27 = torch.matmul(query_layer_19, transpose_39)
+        query_layer_19 = transpose_39 = None
+        attention_scores_28 = attention_scores_27 / 8.0
+        attention_scores_27 = None
+        attention_scores_29 = attention_scores_28 + extended_attention_mask_2
+        attention_scores_28 = None
+        attention_probs_18 = torch.nn.functional.softmax(attention_scores_29, dim=-1)
+        attention_scores_29 = None
+        attention_probs_19 = torch.nn.functional.dropout(attention_probs_18, 0.1, False, False)
+        attention_probs_18 = None
+        context_layer_27 = torch.matmul(attention_probs_19, value_layer_19)
+        attention_probs_19 = value_layer_19 = None
+        permute_9 = context_layer_27.permute(0, 2, 1, 3)
+        context_layer_27 = None
+        context_layer_28 = permute_9.contiguous()
+        permute_9 = None
+        context_layer_29 = context_layer_28.view((1, 20, 1024))
+        context_layer_28 = None
+        hidden_states_72 = torch._C._nn.linear(
+            context_layer_29,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_73 = torch.nn.functional.dropout(hidden_states_72, 0.1, False, False)
+        hidden_states_72 = None
+        add_29 = hidden_states_73 + hidden_states_71
+        hidden_states_73 = hidden_states_71 = None
+        hidden_states_74 = torch.nn.functional.layer_norm(
+            add_29,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_29 = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_75 = torch._C._nn.linear(
+            hidden_states_74,
+            l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_76 = torch._C._nn.gelu(hidden_states_75)
+        hidden_states_75 = None
+        hidden_states_77 = torch._C._nn.linear(
+            hidden_states_76,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_76 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_78 = torch.nn.functional.dropout(hidden_states_77, 0.1, False, False)
+        hidden_states_77 = None
+        add_30 = hidden_states_78 + hidden_states_74
+        hidden_states_78 = hidden_states_74 = None
+        hidden_states_79 = torch.nn.functional.layer_norm(
+            add_30,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_30 = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_9_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_20 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_40 = query_layer_20.view(1, -1, 16, 64)
+        query_layer_20 = None
+        query_layer_21 = view_40.transpose(1, 2)
+        view_40 = None
+        key_layer_20 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_41 = key_layer_20.view(1, -1, 16, 64)
+        key_layer_20 = None
+        key_layer_21 = view_41.transpose(1, 2)
+        view_41 = None
+        value_layer_20 = torch._C._nn.linear(
+            hidden_states_79,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_42 = value_layer_20.view(1, -1, 16, 64)
+        value_layer_20 = None
+        value_layer_21 = view_42.transpose(1, 2)
+        view_42 = None
+        transpose_43 = key_layer_21.transpose(-1, -2)
+        key_layer_21 = None
+        attention_scores_30 = torch.matmul(query_layer_21, transpose_43)
+        query_layer_21 = transpose_43 = None
+        attention_scores_31 = attention_scores_30 / 8.0
+        attention_scores_30 = None
+        attention_scores_32 = attention_scores_31 + extended_attention_mask_2
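Editor's note: `extended_attention_mask_2` is added to the raw scores in every layer, so it must already be an additive mask (zeros at visible positions, large negative values at masked ones), broadcastable over (1, 16, 20, 20) scores. Its construction lives outside this excerpt; a sketch of the usual convention for turning a 0/1 padding mask into such a tensor (this helper is illustrative, not part of the patch):

    import torch

    def extend_attention_mask(mask: torch.Tensor, dtype=torch.float32) -> torch.Tensor:
        # (batch, seq) 0/1 mask -> (batch, 1, 1, seq) additive mask of 0 / large-negative
        extended = mask[:, None, None, :].to(dtype)
        return (1.0 - extended) * torch.finfo(dtype).min

    m = extend_attention_mask(torch.ones(1, 20))  # all-visible input: additive mask of zeros
    assert m.shape == (1, 1, 1, 20) and m.abs().sum() == 0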
+        attention_scores_31 = None
+        attention_probs_20 = torch.nn.functional.softmax(attention_scores_32, dim=-1)
+        attention_scores_32 = None
+        attention_probs_21 = torch.nn.functional.dropout(attention_probs_20, 0.1, False, False)
+        attention_probs_20 = None
+        context_layer_30 = torch.matmul(attention_probs_21, value_layer_21)
+        attention_probs_21 = value_layer_21 = None
+        permute_10 = context_layer_30.permute(0, 2, 1, 3)
+        context_layer_30 = None
+        context_layer_31 = permute_10.contiguous()
+        permute_10 = None
+        context_layer_32 = context_layer_31.view((1, 20, 1024))
+        context_layer_31 = None
+        hidden_states_80 = torch._C._nn.linear(
+            context_layer_32,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_81 = torch.nn.functional.dropout(hidden_states_80, 0.1, False, False)
+        hidden_states_80 = None
+        add_32 = hidden_states_81 + hidden_states_79
+        hidden_states_81 = hidden_states_79 = None
+        hidden_states_82 = torch.nn.functional.layer_norm(
+            add_32,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_32 = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_83 = torch._C._nn.linear(
+            hidden_states_82,
+            l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_84 = torch._C._nn.gelu(hidden_states_83)
+        hidden_states_83 = None
+        hidden_states_85 = torch._C._nn.linear(
+            hidden_states_84,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_84 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_86 = torch.nn.functional.dropout(hidden_states_85, 0.1, False, False)
+        hidden_states_85 = None
+        add_33 = hidden_states_86 + hidden_states_82
+        hidden_states_86 = hidden_states_82 = None
+        hidden_states_87 = torch.nn.functional.layer_norm(
+            add_33,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_33 = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_10_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_22 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_44 = query_layer_22.view(1, -1, 16, 64)
+        query_layer_22 = None
+        query_layer_23 = view_44.transpose(1, 2)
+        view_44 = None
+        key_layer_22 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_45 = key_layer_22.view(1, -1, 16, 64)
+        key_layer_22 = None
+        key_layer_23 = view_45.transpose(1, 2)
+        view_45 = None
+        value_layer_22 = torch._C._nn.linear(
+            hidden_states_87,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_46 = value_layer_22.view(1, -1, 16, 64)
+        value_layer_22 = None
+        value_layer_23 = view_46.transpose(1, 2)
+        view_46 = None
+        transpose_47 = key_layer_23.transpose(-1, -2)
+        key_layer_23 = None
+        attention_scores_33 = torch.matmul(query_layer_23, transpose_47)
+        query_layer_23 = transpose_47 = None
+        attention_scores_34 = attention_scores_33 / 8.0
+        attention_scores_33 = None
+        attention_scores_35 = attention_scores_34 + extended_attention_mask_2
+        attention_scores_34 = None
+        attention_probs_22 = torch.nn.functional.softmax(attention_scores_35, dim=-1)
+        attention_scores_35 = None
+        attention_probs_23 = torch.nn.functional.dropout(attention_probs_22, 0.1, False, False)
+        attention_probs_22 = None
+        context_layer_33 = torch.matmul(attention_probs_23, value_layer_23)
+        attention_probs_23 = value_layer_23 = None
+        permute_11 = context_layer_33.permute(0, 2, 1, 3)
+        context_layer_33 = None
+        context_layer_34 = permute_11.contiguous()
+        permute_11 = None
+        context_layer_35 = context_layer_34.view((1, 20, 1024))
+        context_layer_34 = None
+        hidden_states_88 = torch._C._nn.linear(
+            context_layer_35,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_89 = torch.nn.functional.dropout(hidden_states_88, 0.1, False, False)
+        hidden_states_88 = None
+        add_35 = hidden_states_89 + hidden_states_87
+        hidden_states_89 = hidden_states_87 = None
+        hidden_states_90 = torch.nn.functional.layer_norm(
+            add_35,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_35 = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_91 = torch._C._nn.linear(
+            hidden_states_90,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_92 = torch._C._nn.gelu(hidden_states_91)
+        hidden_states_91 = None
+        hidden_states_93 = torch._C._nn.linear(
+            hidden_states_92,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_92 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_94 = torch.nn.functional.dropout(hidden_states_93, 0.1, False, False)
+        hidden_states_93 = None
+        add_36 = hidden_states_94 + hidden_states_90
+        hidden_states_94 = hidden_states_90 = None
+        hidden_states_95 = torch.nn.functional.layer_norm(
+            add_36,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_36 = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_11_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_48 = query_layer_24.view(1, -1, 16, 64)
+        query_layer_24 = None
+        query_layer_25 = view_48.transpose(1, 2)
+        view_48 = None
+        key_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_49 = key_layer_24.view(1, -1, 16, 64)
+        key_layer_24 = None
+        key_layer_25 = view_49.transpose(1, 2)
+        view_49 = None
+        value_layer_24 = torch._C._nn.linear(
+            hidden_states_95,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_50 = value_layer_24.view(1, -1, 16, 64)
+        value_layer_24 = None
+        value_layer_25 = view_50.transpose(1, 2)
+        view_50 = None
+        transpose_51 = key_layer_25.transpose(-1, -2)
+        key_layer_25 = None
+        attention_scores_36 = torch.matmul(query_layer_25, transpose_51)
+        query_layer_25 = transpose_51 = None
+        attention_scores_37 = attention_scores_36 / 8.0
+        attention_scores_36 = None
+        attention_scores_38 = attention_scores_37 + extended_attention_mask_2
+        attention_scores_37 = None
+        attention_probs_24 = torch.nn.functional.softmax(attention_scores_38, dim=-1)
+        attention_scores_38 = None
+        attention_probs_25 = torch.nn.functional.dropout(attention_probs_24, 0.1, False, False)
+        attention_probs_24 = None
+        context_layer_36 = torch.matmul(attention_probs_25, value_layer_25)
+        attention_probs_25 = value_layer_25 = None
+        permute_12 = context_layer_36.permute(0, 2, 1, 3)
+        context_layer_36 = None
+        context_layer_37 = permute_12.contiguous()
+        permute_12 = None
+        context_layer_38 = context_layer_37.view((1, 20, 1024))
+        context_layer_37 = None
+        hidden_states_96 = torch._C._nn.linear(
+            context_layer_38,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_97 = torch.nn.functional.dropout(hidden_states_96, 0.1, False, False)
+        hidden_states_96 = None
+        add_38 = hidden_states_97 + hidden_states_95
+        hidden_states_97 = hidden_states_95 = None
+        hidden_states_98 = torch.nn.functional.layer_norm(
+            add_38,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_38 = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_99 = torch._C._nn.linear(
+            hidden_states_98,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_100 = torch._C._nn.gelu(hidden_states_99)
+        hidden_states_99 = None
+        hidden_states_101 = torch._C._nn.linear(
+            hidden_states_100,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_100 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_102 = torch.nn.functional.dropout(hidden_states_101, 0.1, False, False)
+        hidden_states_101 = None
+        add_39 = hidden_states_102 + hidden_states_98
+        hidden_states_102 = hidden_states_98 = None
+        hidden_states_103 = torch.nn.functional.layer_norm(
+            add_39,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_39 = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_12_modules_output_modules_layer_norm_parameters_bias_ = None
+        query_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_ = None
+        view_52 = query_layer_26.view(1, -1, 16, 64)
+        query_layer_26 = None
+        query_layer_27 = view_52.transpose(1, 2)
+        view_52 = None
+        key_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_ = None
+        view_53 = key_layer_26.view(1, -1, 16, 64)
+        key_layer_26 = None
+        key_layer_27 = view_53.transpose(1, 2)
+        view_53 = None
+        value_layer_26 = torch._C._nn.linear(
+            hidden_states_103,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_ = None
+        view_54 = value_layer_26.view(1, -1, 16, 64)
+        value_layer_26 = None
+        value_layer_27 = view_54.transpose(1, 2)
+        view_54 = None
+        transpose_55 = key_layer_27.transpose(-1, -2)
+        key_layer_27 = None
+        attention_scores_39 = torch.matmul(query_layer_27, transpose_55)
+        query_layer_27 = transpose_55 = None
+        attention_scores_40 = attention_scores_39 / 8.0
+        attention_scores_39 = None
+        attention_scores_41 = attention_scores_40 + extended_attention_mask_2
+        attention_scores_40 = None
+        attention_probs_26 = torch.nn.functional.softmax(attention_scores_41, dim=-1)
+        attention_scores_41 = None
+        attention_probs_27 = torch.nn.functional.dropout(attention_probs_26, 0.1, False, False)
+        attention_probs_26 = None
+        context_layer_39 = torch.matmul(attention_probs_27, value_layer_27)
+        attention_probs_27 = value_layer_27 = None
+        permute_13 = context_layer_39.permute(0, 2, 1, 3)
+        context_layer_39 = None
+        context_layer_40 = permute_13.contiguous()
+        permute_13 = None
+        context_layer_41 = context_layer_40.view((1, 20, 1024))
+        context_layer_40 = None
+        hidden_states_104 = torch._C._nn.linear(
+            context_layer_41,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_,
+        )
+        context_layer_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_105 = torch.nn.functional.dropout(hidden_states_104, 0.1, False, False)
+        hidden_states_104 = None
+        add_41 = hidden_states_105 + hidden_states_103
+        hidden_states_105 = hidden_states_103 = None
+        hidden_states_106 = torch.nn.functional.layer_norm(
+            add_41,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_41 = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = None
+        hidden_states_107 = torch._C._nn.linear(
+            hidden_states_106,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_,
+        )
+        l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_ = None
+        hidden_states_108 = torch._C._nn.gelu(hidden_states_107)
+        hidden_states_107 = None
+        hidden_states_109 = torch._C._nn.linear(
+            hidden_states_108,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_108 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_ = None
+        hidden_states_110 = torch.nn.functional.dropout(
hidden_states_109, 0.1, False, False + ) + hidden_states_109 = None + add_42 = hidden_states_110 + hidden_states_106 + hidden_states_110 = hidden_states_106 = None + hidden_states_111 = torch.nn.functional.layer_norm( + add_42, + (1024,), + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_42 = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_13_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_28 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_56 = query_layer_28.view(1, -1, 16, 64) + query_layer_28 = None + query_layer_29 = view_56.transpose(1, 2) + view_56 = None + key_layer_28 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_57 = key_layer_28.view(1, -1, 16, 64) + key_layer_28 = None + key_layer_29 = view_57.transpose(1, 2) + view_57 = None + value_layer_28 = torch._C._nn.linear( + hidden_states_111, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_58 = value_layer_28.view(1, -1, 16, 64) + value_layer_28 = None + value_layer_29 = view_58.transpose(1, 2) + view_58 = None + transpose_59 = key_layer_29.transpose(-1, -2) + key_layer_29 = None + attention_scores_42 = torch.matmul(query_layer_29, transpose_59) + query_layer_29 = transpose_59 = None + attention_scores_43 = attention_scores_42 / 8.0 + attention_scores_42 = None + attention_scores_44 = attention_scores_43 + extended_attention_mask_2 + attention_scores_43 = None + attention_probs_28 = torch.nn.functional.softmax(attention_scores_44, dim=-1) + attention_scores_44 = None + attention_probs_29 = torch.nn.functional.dropout( + attention_probs_28, 0.1, False, False + ) + attention_probs_28 = None + context_layer_42 = torch.matmul(attention_probs_29, value_layer_29) + attention_probs_29 = value_layer_29 = None + permute_14 = context_layer_42.permute(0, 2, 1, 3) + context_layer_42 = None + context_layer_43 = permute_14.contiguous() + permute_14 = None + context_layer_44 = context_layer_43.view((1, 20, 1024)) + 
context_layer_43 = None + hidden_states_112 = torch._C._nn.linear( + context_layer_44, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_113 = torch.nn.functional.dropout( + hidden_states_112, 0.1, False, False + ) + hidden_states_112 = None + add_44 = hidden_states_113 + hidden_states_111 + hidden_states_113 = hidden_states_111 = None + hidden_states_114 = torch.nn.functional.layer_norm( + add_44, + (1024,), + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_44 = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_115 = torch._C._nn.linear( + hidden_states_114, + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_116 = torch._C._nn.gelu(hidden_states_115) + hidden_states_115 = None + hidden_states_117 = torch._C._nn.linear( + hidden_states_116, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_116 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_118 = torch.nn.functional.dropout( + hidden_states_117, 0.1, False, False + ) + hidden_states_117 = None + add_45 = hidden_states_118 + hidden_states_114 + hidden_states_118 = hidden_states_114 = None + hidden_states_119 = torch.nn.functional.layer_norm( + add_45, + (1024,), + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_45 = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_14_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_30 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_, + ) + 
l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_60 = query_layer_30.view(1, -1, 16, 64) + query_layer_30 = None + query_layer_31 = view_60.transpose(1, 2) + view_60 = None + key_layer_30 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_61 = key_layer_30.view(1, -1, 16, 64) + key_layer_30 = None + key_layer_31 = view_61.transpose(1, 2) + view_61 = None + value_layer_30 = torch._C._nn.linear( + hidden_states_119, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_62 = value_layer_30.view(1, -1, 16, 64) + value_layer_30 = None + value_layer_31 = view_62.transpose(1, 2) + view_62 = None + transpose_63 = key_layer_31.transpose(-1, -2) + key_layer_31 = None + attention_scores_45 = torch.matmul(query_layer_31, transpose_63) + query_layer_31 = transpose_63 = None + attention_scores_46 = attention_scores_45 / 8.0 + attention_scores_45 = None + attention_scores_47 = attention_scores_46 + extended_attention_mask_2 + attention_scores_46 = None + attention_probs_30 = torch.nn.functional.softmax(attention_scores_47, dim=-1) + attention_scores_47 = None + attention_probs_31 = torch.nn.functional.dropout( + attention_probs_30, 0.1, False, False + ) + attention_probs_30 = None + context_layer_45 = torch.matmul(attention_probs_31, value_layer_31) + attention_probs_31 = value_layer_31 = None + permute_15 = context_layer_45.permute(0, 2, 1, 3) + context_layer_45 = None + context_layer_46 = permute_15.contiguous() + permute_15 = None + context_layer_47 = context_layer_46.view((1, 20, 1024)) + context_layer_46 = None + hidden_states_120 = torch._C._nn.linear( + context_layer_47, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_121 = torch.nn.functional.dropout( + hidden_states_120, 0.1, False, False + ) + hidden_states_120 = None + add_47 = hidden_states_121 + hidden_states_119 + hidden_states_121 = hidden_states_119 = None + hidden_states_122 = torch.nn.functional.layer_norm( + add_47, + (1024,), + 
l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_47 = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_123 = torch._C._nn.linear( + hidden_states_122, + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_124 = torch._C._nn.gelu(hidden_states_123) + hidden_states_123 = None + hidden_states_125 = torch._C._nn.linear( + hidden_states_124, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_124 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_126 = torch.nn.functional.dropout( + hidden_states_125, 0.1, False, False + ) + hidden_states_125 = None + add_48 = hidden_states_126 + hidden_states_122 + hidden_states_126 = hidden_states_122 = None + hidden_states_127 = torch.nn.functional.layer_norm( + add_48, + (1024,), + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_48 = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_15_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_32 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_64 = query_layer_32.view(1, -1, 16, 64) + query_layer_32 = None + query_layer_33 = view_64.transpose(1, 2) + view_64 = None + key_layer_32 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_65 = key_layer_32.view(1, -1, 16, 64) + key_layer_32 = None + 
key_layer_33 = view_65.transpose(1, 2) + view_65 = None + value_layer_32 = torch._C._nn.linear( + hidden_states_127, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_66 = value_layer_32.view(1, -1, 16, 64) + value_layer_32 = None + value_layer_33 = view_66.transpose(1, 2) + view_66 = None + transpose_67 = key_layer_33.transpose(-1, -2) + key_layer_33 = None + attention_scores_48 = torch.matmul(query_layer_33, transpose_67) + query_layer_33 = transpose_67 = None + attention_scores_49 = attention_scores_48 / 8.0 + attention_scores_48 = None + attention_scores_50 = attention_scores_49 + extended_attention_mask_2 + attention_scores_49 = None + attention_probs_32 = torch.nn.functional.softmax(attention_scores_50, dim=-1) + attention_scores_50 = None + attention_probs_33 = torch.nn.functional.dropout( + attention_probs_32, 0.1, False, False + ) + attention_probs_32 = None + context_layer_48 = torch.matmul(attention_probs_33, value_layer_33) + attention_probs_33 = value_layer_33 = None + permute_16 = context_layer_48.permute(0, 2, 1, 3) + context_layer_48 = None + context_layer_49 = permute_16.contiguous() + permute_16 = None + context_layer_50 = context_layer_49.view((1, 20, 1024)) + context_layer_49 = None + hidden_states_128 = torch._C._nn.linear( + context_layer_50, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_129 = torch.nn.functional.dropout( + hidden_states_128, 0.1, False, False + ) + hidden_states_128 = None + add_50 = hidden_states_129 + hidden_states_127 + hidden_states_129 = hidden_states_127 = None + hidden_states_130 = torch.nn.functional.layer_norm( + add_50, + (1024,), + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_50 = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_131 = torch._C._nn.linear( + hidden_states_130, + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_132 = 
torch._C._nn.gelu(hidden_states_131) + hidden_states_131 = None + hidden_states_133 = torch._C._nn.linear( + hidden_states_132, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_132 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_134 = torch.nn.functional.dropout( + hidden_states_133, 0.1, False, False + ) + hidden_states_133 = None + add_51 = hidden_states_134 + hidden_states_130 + hidden_states_134 = hidden_states_130 = None + hidden_states_135 = torch.nn.functional.layer_norm( + add_51, + (1024,), + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_51 = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_16_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_34 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_68 = query_layer_34.view(1, -1, 16, 64) + query_layer_34 = None + query_layer_35 = view_68.transpose(1, 2) + view_68 = None + key_layer_34 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_69 = key_layer_34.view(1, -1, 16, 64) + key_layer_34 = None + key_layer_35 = view_69.transpose(1, 2) + view_69 = None + value_layer_34 = torch._C._nn.linear( + hidden_states_135, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_70 = value_layer_34.view(1, -1, 16, 64) + value_layer_34 = None + value_layer_35 = view_70.transpose(1, 2) + view_70 = None + transpose_71 = key_layer_35.transpose(-1, -2) + key_layer_35 = None + attention_scores_51 = torch.matmul(query_layer_35, transpose_71) + query_layer_35 = transpose_71 = None + attention_scores_52 = attention_scores_51 / 8.0 + attention_scores_51 = None + attention_scores_53 = attention_scores_52 + 
extended_attention_mask_2 + attention_scores_52 = None + attention_probs_34 = torch.nn.functional.softmax(attention_scores_53, dim=-1) + attention_scores_53 = None + attention_probs_35 = torch.nn.functional.dropout( + attention_probs_34, 0.1, False, False + ) + attention_probs_34 = None + context_layer_51 = torch.matmul(attention_probs_35, value_layer_35) + attention_probs_35 = value_layer_35 = None + permute_17 = context_layer_51.permute(0, 2, 1, 3) + context_layer_51 = None + context_layer_52 = permute_17.contiguous() + permute_17 = None + context_layer_53 = context_layer_52.view((1, 20, 1024)) + context_layer_52 = None + hidden_states_136 = torch._C._nn.linear( + context_layer_53, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_137 = torch.nn.functional.dropout( + hidden_states_136, 0.1, False, False + ) + hidden_states_136 = None + add_53 = hidden_states_137 + hidden_states_135 + hidden_states_137 = hidden_states_135 = None + hidden_states_138 = torch.nn.functional.layer_norm( + add_53, + (1024,), + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_53 = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_139 = torch._C._nn.linear( + hidden_states_138, + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_140 = torch._C._nn.gelu(hidden_states_139) + hidden_states_139 = None + hidden_states_141 = torch._C._nn.linear( + hidden_states_140, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_140 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_142 = torch.nn.functional.dropout( + hidden_states_141, 0.1, False, False + ) + hidden_states_141 = None + add_54 = hidden_states_142 + hidden_states_138 + hidden_states_142 = hidden_states_138 = None + hidden_states_143 = torch.nn.functional.layer_norm( + add_54, + (1024,), + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_, 
+ 1e-12, + ) + add_54 = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_17_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_36 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_72 = query_layer_36.view(1, -1, 16, 64) + query_layer_36 = None + query_layer_37 = view_72.transpose(1, 2) + view_72 = None + key_layer_36 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_73 = key_layer_36.view(1, -1, 16, 64) + key_layer_36 = None + key_layer_37 = view_73.transpose(1, 2) + view_73 = None + value_layer_36 = torch._C._nn.linear( + hidden_states_143, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_74 = value_layer_36.view(1, -1, 16, 64) + value_layer_36 = None + value_layer_37 = view_74.transpose(1, 2) + view_74 = None + transpose_75 = key_layer_37.transpose(-1, -2) + key_layer_37 = None + attention_scores_54 = torch.matmul(query_layer_37, transpose_75) + query_layer_37 = transpose_75 = None + attention_scores_55 = attention_scores_54 / 8.0 + attention_scores_54 = None + attention_scores_56 = attention_scores_55 + extended_attention_mask_2 + attention_scores_55 = None + attention_probs_36 = torch.nn.functional.softmax(attention_scores_56, dim=-1) + attention_scores_56 = None + attention_probs_37 = torch.nn.functional.dropout( + attention_probs_36, 0.1, False, False + ) + attention_probs_36 = None + context_layer_54 = torch.matmul(attention_probs_37, value_layer_37) + attention_probs_37 = value_layer_37 = None + permute_18 = context_layer_54.permute(0, 2, 1, 3) + context_layer_54 = None + context_layer_55 = permute_18.contiguous() + permute_18 = None + context_layer_56 = context_layer_55.view((1, 20, 1024)) + context_layer_55 = None + hidden_states_144 = torch._C._nn.linear( + context_layer_56, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_56 = 
l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_145 = torch.nn.functional.dropout( + hidden_states_144, 0.1, False, False + ) + hidden_states_144 = None + add_56 = hidden_states_145 + hidden_states_143 + hidden_states_145 = hidden_states_143 = None + hidden_states_146 = torch.nn.functional.layer_norm( + add_56, + (1024,), + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_56 = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_147 = torch._C._nn.linear( + hidden_states_146, + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_148 = torch._C._nn.gelu(hidden_states_147) + hidden_states_147 = None + hidden_states_149 = torch._C._nn.linear( + hidden_states_148, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_148 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_150 = torch.nn.functional.dropout( + hidden_states_149, 0.1, False, False + ) + hidden_states_149 = None + add_57 = hidden_states_150 + hidden_states_146 + hidden_states_150 = hidden_states_146 = None + hidden_states_151 = torch.nn.functional.layer_norm( + add_57, + (1024,), + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_57 = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_18_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_38 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_76 = query_layer_38.view(1, -1, 16, 64) + query_layer_38 = None + query_layer_39 = view_76.transpose(1, 2) + view_76 = None + key_layer_38 = torch._C._nn.linear( + 
hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_77 = key_layer_38.view(1, -1, 16, 64) + key_layer_38 = None + key_layer_39 = view_77.transpose(1, 2) + view_77 = None + value_layer_38 = torch._C._nn.linear( + hidden_states_151, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_78 = value_layer_38.view(1, -1, 16, 64) + value_layer_38 = None + value_layer_39 = view_78.transpose(1, 2) + view_78 = None + transpose_79 = key_layer_39.transpose(-1, -2) + key_layer_39 = None + attention_scores_57 = torch.matmul(query_layer_39, transpose_79) + query_layer_39 = transpose_79 = None + attention_scores_58 = attention_scores_57 / 8.0 + attention_scores_57 = None + attention_scores_59 = attention_scores_58 + extended_attention_mask_2 + attention_scores_58 = None + attention_probs_38 = torch.nn.functional.softmax(attention_scores_59, dim=-1) + attention_scores_59 = None + attention_probs_39 = torch.nn.functional.dropout( + attention_probs_38, 0.1, False, False + ) + attention_probs_38 = None + context_layer_57 = torch.matmul(attention_probs_39, value_layer_39) + attention_probs_39 = value_layer_39 = None + permute_19 = context_layer_57.permute(0, 2, 1, 3) + context_layer_57 = None + context_layer_58 = permute_19.contiguous() + permute_19 = None + context_layer_59 = context_layer_58.view((1, 20, 1024)) + context_layer_58 = None + hidden_states_152 = torch._C._nn.linear( + context_layer_59, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_153 = torch.nn.functional.dropout( + hidden_states_152, 0.1, False, False + ) + hidden_states_152 = None + add_59 = hidden_states_153 + hidden_states_151 + hidden_states_153 = hidden_states_151 = None + hidden_states_154 = torch.nn.functional.layer_norm( + add_59, + (1024,), + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_59 = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) 
+ hidden_states_155 = torch._C._nn.linear( + hidden_states_154, + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_156 = torch._C._nn.gelu(hidden_states_155) + hidden_states_155 = None + hidden_states_157 = torch._C._nn.linear( + hidden_states_156, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_156 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_158 = torch.nn.functional.dropout( + hidden_states_157, 0.1, False, False + ) + hidden_states_157 = None + add_60 = hidden_states_158 + hidden_states_154 + hidden_states_158 = hidden_states_154 = None + hidden_states_159 = torch.nn.functional.layer_norm( + add_60, + (1024,), + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_60 = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_19_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_40 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_80 = query_layer_40.view(1, -1, 16, 64) + query_layer_40 = None + query_layer_41 = view_80.transpose(1, 2) + view_80 = None + key_layer_40 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_81 = key_layer_40.view(1, -1, 16, 64) + key_layer_40 = None + key_layer_41 = view_81.transpose(1, 2) + view_81 = None + value_layer_40 = torch._C._nn.linear( + hidden_states_159, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_82 = value_layer_40.view(1, -1, 16, 64) + value_layer_40 = None + value_layer_41 = view_82.transpose(1, 2) + view_82 = None + transpose_83 = key_layer_41.transpose(-1, -2) + key_layer_41 = None + attention_scores_60 = torch.matmul(query_layer_41, transpose_83) + query_layer_41 = transpose_83 = None + attention_scores_61 = attention_scores_60 / 8.0 + attention_scores_60 = None + attention_scores_62 = attention_scores_61 + extended_attention_mask_2 + attention_scores_61 = None + attention_probs_40 = torch.nn.functional.softmax(attention_scores_62, dim=-1) + attention_scores_62 = None + attention_probs_41 = torch.nn.functional.dropout( + attention_probs_40, 0.1, False, False + ) + attention_probs_40 = None + context_layer_60 = torch.matmul(attention_probs_41, value_layer_41) + attention_probs_41 = value_layer_41 = None + permute_20 = context_layer_60.permute(0, 2, 1, 3) + context_layer_60 = None + context_layer_61 = permute_20.contiguous() + permute_20 = None + context_layer_62 = context_layer_61.view((1, 20, 1024)) + context_layer_61 = None + hidden_states_160 = torch._C._nn.linear( + context_layer_62, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_161 = torch.nn.functional.dropout( + hidden_states_160, 0.1, False, False + ) + hidden_states_160 = None + add_62 = hidden_states_161 + hidden_states_159 + hidden_states_161 = hidden_states_159 = None + hidden_states_162 = torch.nn.functional.layer_norm( + add_62, + (1024,), + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_62 = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_163 = torch._C._nn.linear( + hidden_states_162, + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_164 = torch._C._nn.gelu(hidden_states_163) + hidden_states_163 = None + hidden_states_165 = torch._C._nn.linear( + hidden_states_164, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_164 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_166 = torch.nn.functional.dropout( + hidden_states_165, 0.1, False, False + ) + hidden_states_165 = None + add_63 = hidden_states_166 + hidden_states_162 + hidden_states_166 = hidden_states_162 = None + hidden_states_167 = torch.nn.functional.layer_norm( + add_63, + (1024,), + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_63 = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_20_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_42 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_84 = query_layer_42.view(1, -1, 16, 64) + query_layer_42 = None + query_layer_43 = view_84.transpose(1, 2) + view_84 = None + key_layer_42 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_85 = key_layer_42.view(1, -1, 16, 64) + key_layer_42 = None + key_layer_43 = view_85.transpose(1, 2) + view_85 = None + value_layer_42 = torch._C._nn.linear( + hidden_states_167, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_86 = value_layer_42.view(1, -1, 16, 64) + value_layer_42 = None + value_layer_43 = view_86.transpose(1, 2) + view_86 = None + transpose_87 = key_layer_43.transpose(-1, -2) + key_layer_43 = None + attention_scores_63 = torch.matmul(query_layer_43, transpose_87) + query_layer_43 = transpose_87 = None + attention_scores_64 = attention_scores_63 / 8.0 + attention_scores_63 = None + attention_scores_65 = attention_scores_64 + extended_attention_mask_2 + attention_scores_64 = None + attention_probs_42 = torch.nn.functional.softmax(attention_scores_65, dim=-1) + attention_scores_65 = None + attention_probs_43 = torch.nn.functional.dropout( + attention_probs_42, 0.1, False, False + ) + attention_probs_42 = None + context_layer_63 = torch.matmul(attention_probs_43, value_layer_43) + attention_probs_43 = value_layer_43 = None + permute_21 = context_layer_63.permute(0, 2, 1, 3) + 
context_layer_63 = None + context_layer_64 = permute_21.contiguous() + permute_21 = None + context_layer_65 = context_layer_64.view((1, 20, 1024)) + context_layer_64 = None + hidden_states_168 = torch._C._nn.linear( + context_layer_65, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_65 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_169 = torch.nn.functional.dropout( + hidden_states_168, 0.1, False, False + ) + hidden_states_168 = None + add_65 = hidden_states_169 + hidden_states_167 + hidden_states_169 = hidden_states_167 = None + hidden_states_170 = torch.nn.functional.layer_norm( + add_65, + (1024,), + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_65 = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_171 = torch._C._nn.linear( + hidden_states_170, + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_172 = torch._C._nn.gelu(hidden_states_171) + hidden_states_171 = None + hidden_states_173 = torch._C._nn.linear( + hidden_states_172, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_172 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_174 = torch.nn.functional.dropout( + hidden_states_173, 0.1, False, False + ) + hidden_states_173 = None + add_66 = hidden_states_174 + hidden_states_170 + hidden_states_174 = hidden_states_170 = None + hidden_states_175 = torch.nn.functional.layer_norm( + add_66, + (1024,), + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_66 = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_21_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_44 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_88 = query_layer_44.view(1, -1, 16, 64) + query_layer_44 = None + query_layer_45 = view_88.transpose(1, 2) + view_88 = None + key_layer_44 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_89 = key_layer_44.view(1, -1, 16, 64) + key_layer_44 = None + key_layer_45 = view_89.transpose(1, 2) + view_89 = None + value_layer_44 = torch._C._nn.linear( + hidden_states_175, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_90 = value_layer_44.view(1, -1, 16, 64) + value_layer_44 = None + value_layer_45 = view_90.transpose(1, 2) + view_90 = None + transpose_91 = key_layer_45.transpose(-1, -2) + key_layer_45 = None + attention_scores_66 = torch.matmul(query_layer_45, transpose_91) + query_layer_45 = transpose_91 = None + attention_scores_67 = attention_scores_66 / 8.0 + attention_scores_66 = None + attention_scores_68 = attention_scores_67 + extended_attention_mask_2 + attention_scores_67 = None + attention_probs_44 = torch.nn.functional.softmax(attention_scores_68, dim=-1) + attention_scores_68 = None + attention_probs_45 = torch.nn.functional.dropout( + attention_probs_44, 0.1, False, False + ) + attention_probs_44 = None + context_layer_66 = torch.matmul(attention_probs_45, value_layer_45) + attention_probs_45 = value_layer_45 = None + permute_22 = context_layer_66.permute(0, 2, 1, 3) + context_layer_66 = None + context_layer_67 = permute_22.contiguous() + permute_22 = None + context_layer_68 = context_layer_67.view((1, 20, 1024)) + context_layer_67 = None + hidden_states_176 = torch._C._nn.linear( + context_layer_68, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_177 = torch.nn.functional.dropout( + hidden_states_176, 0.1, False, False + ) + hidden_states_176 = None + add_68 = hidden_states_177 + hidden_states_175 + hidden_states_177 = hidden_states_175 = None + hidden_states_178 = torch.nn.functional.layer_norm( + add_68, + 
(1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_68 = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_179 = torch._C._nn.linear( + hidden_states_178, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_180 = torch._C._nn.gelu(hidden_states_179) + hidden_states_179 = None + hidden_states_181 = torch._C._nn.linear( + hidden_states_180, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_180 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_182 = torch.nn.functional.dropout( + hidden_states_181, 0.1, False, False + ) + hidden_states_181 = None + add_69 = hidden_states_182 + hidden_states_178 + hidden_states_182 = hidden_states_178 = None + hidden_states_183 = torch.nn.functional.layer_norm( + add_69, + (1024,), + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_69 = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_22_modules_output_modules_layer_norm_parameters_bias_ = (None) + query_layer_46 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_92 = query_layer_46.view(1, -1, 16, 64) + query_layer_46 = None + query_layer_47 = view_92.transpose(1, 2) + view_92 = None + key_layer_46 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_93 = key_layer_46.view(1, -1, 16, 64) + 
key_layer_46 = None + key_layer_47 = view_93.transpose(1, 2) + view_93 = None + value_layer_46 = torch._C._nn.linear( + hidden_states_183, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_94 = value_layer_46.view(1, -1, 16, 64) + value_layer_46 = None + value_layer_47 = view_94.transpose(1, 2) + view_94 = None + transpose_95 = key_layer_47.transpose(-1, -2) + key_layer_47 = None + attention_scores_69 = torch.matmul(query_layer_47, transpose_95) + query_layer_47 = transpose_95 = None + attention_scores_70 = attention_scores_69 / 8.0 + attention_scores_69 = None + attention_scores_71 = attention_scores_70 + extended_attention_mask_2 + attention_scores_70 = extended_attention_mask_2 = None + attention_probs_46 = torch.nn.functional.softmax(attention_scores_71, dim=-1) + attention_scores_71 = None + attention_probs_47 = torch.nn.functional.dropout( + attention_probs_46, 0.1, False, False + ) + attention_probs_46 = None + context_layer_69 = torch.matmul(attention_probs_47, value_layer_47) + attention_probs_47 = value_layer_47 = None + permute_23 = context_layer_69.permute(0, 2, 1, 3) + context_layer_69 = None + context_layer_70 = permute_23.contiguous() + permute_23 = None + context_layer_71 = context_layer_70.view((1, 20, 1024)) + context_layer_70 = None + hidden_states_184 = torch._C._nn.linear( + context_layer_71, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + context_layer_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_185 = torch.nn.functional.dropout( + hidden_states_184, 0.1, False, False + ) + hidden_states_184 = None + add_71 = hidden_states_185 + hidden_states_183 + hidden_states_185 = hidden_states_183 = None + hidden_states_186 = torch.nn.functional.layer_norm( + add_71, + (1024,), + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_71 = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_187 = torch._C._nn.linear( + hidden_states_186, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_ = (None) + 
+        hidden_states_188 = torch._C._nn.gelu(hidden_states_187)
+        hidden_states_187 = None
+        hidden_states_189 = torch._C._nn.linear(
+            hidden_states_188,
+            l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_,
+        )
+        hidden_states_188 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_ = (
+            None
+        )
+        hidden_states_190 = torch.nn.functional.dropout(
+            hidden_states_189, 0.1, False, False
+        )
+        hidden_states_189 = None
+        add_72 = hidden_states_190 + hidden_states_186
+        hidden_states_190 = hidden_states_186 = None
+        hidden_states_191 = torch.nn.functional.layer_norm(
+            add_72,
+            (1024,),
+            l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_,
+            l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_,
+            1e-12,
+        )
+        add_72 = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_23_modules_output_modules_layer_norm_parameters_bias_ = (
+            None
+        )
+        return (hidden_states_191,)
diff --git a/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/weight_meta.py b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/weight_meta.py
new file mode 100644
index 000000000..ce9c2bf59
--- /dev/null
+++ b/samples/transformers-auto-model/paulkm_autotrain-lottery_v2-2420075389/weight_meta.py
@@ -0,0 +1,3950 @@
+class Program_weight_tensor_meta_L_input_ids_:
+    name = "L_input_ids_"
+    shape = [1, 20]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [
+        101,
+        149,
+        12220,
+        8914,
+        8310,
+        11927,
+        8168,
+        8228,
+        9577,
+        9808,
+        8722,
+        8134,
+        11300,
+        8794,
+        8315,
+        149,
+        12220,
+        8118,
+        119,
+        102,
+    ]
+
+
+class Program_weight_tensor_meta_L_attention_mask_:
+    name = "L_attention_mask_"
+    shape = [1, 20]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+
+class Program_weight_tensor_meta_L_token_type_ids_:
+    name = "L_token_type_ids_"
+    shape = [1, 20]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_"
+    shape = [21128, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_:
+    name = "L_self_modules_embeddings_buffers_position_ids_"
+    shape = [1, 512]
+    dtype = "torch.int64"
+    device = "cuda:0"
+    mean = None
+    std = None
+    min_val = 0
+    max_val = 511
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_"
+    shape = [2, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.001
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_"
+    shape = [512, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [1024, 1024]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_:
name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: 
+ name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data 
= None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_" + 
shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + 
device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_6_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_7_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = 
"torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_8_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_self_modules_value_parameters_bias_" + 
shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_: + 
name = "L_self_modules_encoder_modules_layer_modules_9_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + 
shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_10_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = 
"torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_11_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_12_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_13_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_14_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_weight_" + shape = 
[1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_15_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_16_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_17_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype 
= "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_18_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_: + name = 
"L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_19_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_20_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_weight_" + shape = 
[1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_21_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_22_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [1024, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class 
Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_weight_" + shape = [4096, 1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_intermediate_modules_dense_parameters_bias_" + shape = [4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_weight_" + shape = [1024, 4096] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_dense_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_weight_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_23_modules_output_modules_LayerNorm_parameters_bias_" + shape = [1024] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_hash.txt b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_hash.txt new file mode 100644 index 000000000..29a2d00f5 --- /dev/null +++ b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_hash.txt @@ -0,0 +1 @@ +1d45e17b5398f3bf09a2fb8445e652943ad2541449c267e7073a4fe7fcf3c4cd \ No newline at end of file diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_net.json 
b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_net.json new file mode 100644 index 000000000..b6ffe9f72 --- /dev/null +++ b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/graph_net.json @@ -0,0 +1,6 @@ +{ + "framework": "torch", + "num_devices_required": 1, + "num_nodes_required": 1, + "dynamic": false +} \ No newline at end of file diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_meta.py b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_meta.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_tensor_constraints.py b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/input_tensor_constraints.py new file mode 100644 index 000000000..e69de29bb diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/model.py b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/model.py new file mode 100644 index 000000000..274d99d36 --- /dev/null +++ b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/model.py @@ -0,0 +1,866 @@ +import torch + + +class GraphModule(torch.nn.Module): + def forward( + self, + L_input_ids_: torch.Tensor, + L_token_type_ids_: torch.Tensor, + L_self_modules_embeddings_buffers_position_ids_: torch.Tensor, + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_attention_mask_: torch.Tensor, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + 
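+        # Reader note (not tracer output): the remaining arguments repeat the
+        # same (weight, bias) Parameter pairs (q/k/v projections, attention
+        # output dense and LayerNorm, intermediate dense, output dense and
+        # LayerNorm) for each of the 6 encoder layers, then the pooler dense
+        # parameters; this export style lifts every parameter and buffer to
+        # an explicit graph input.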
L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_: 
torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_: torch.nn.parameter.Parameter, + 
L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_weight_: torch.nn.parameter.Parameter, + L_self_modules_pooler_modules_dense_parameters_bias_: torch.nn.parameter.Parameter, + ): + l_input_ids_ = L_input_ids_ + l_token_type_ids_ = L_token_type_ids_ + l_self_modules_embeddings_buffers_position_ids_ = ( + L_self_modules_embeddings_buffers_position_ids_ + ) + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ = ( + L_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_weight_ + ) + l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = ( + L_self_modules_embeddings_modules_LayerNorm_parameters_bias_ + ) + l_attention_mask_ = L_attention_mask_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ + 
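+        # Reader note (not tracer output): this long run of l_* = L_*
+        # assignments only rebinds the captured inputs to local names; the
+        # first real computation is the position-id slice and the embedding
+        # lookups after the final rebinding.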
l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ + 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ + 
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_
+        l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_
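+        # Editorial note (a sketch in our words, not emitted by the tracer):
+        # the wall of assignments above and just below is Dynamo's capture
+        # prologue. Every parameter and buffer is lifted into a flat graph
+        # input (the upper-case `L_*` names) and aliased to a local (the
+        # lower-case `l_*` names), so the traced forward is a pure function
+        # of its inputs. Each of the six encoder layers below then repeats
+        # one pattern, roughly (names here are shorthand, not graph names):
+        #
+        #     q = linear(h, W_q, b_q).view(1, -1, 12, 64).transpose(1, 2)
+        #     k = linear(h, W_k, b_k).view(1, -1, 12, 64).transpose(1, 2)
+        #     v = linear(h, W_v, b_v).view(1, -1, 12, 64).transpose(1, 2)
+        #     a = scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        #     a = a.transpose(1, 2).reshape(1, 18, 768)
+        #     h = layer_norm(dropout(linear(a, W_o, b_o)) + h)
+        #     h = layer_norm(dropout(linear(gelu(linear(h, W_i, b_i)), W_f, b_f)) + h)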
l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_ + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_ + l_self_modules_pooler_modules_dense_parameters_weight_ = ( + L_self_modules_pooler_modules_dense_parameters_weight_ + ) + l_self_modules_pooler_modules_dense_parameters_bias_ = ( + L_self_modules_pooler_modules_dense_parameters_bias_ + ) + position_ids = l_self_modules_embeddings_buffers_position_ids_[ + (slice(None, None, None), slice(0, 18, None)) + ] + l_self_modules_embeddings_buffers_position_ids_ = None + inputs_embeds = torch.nn.functional.embedding( + l_input_ids_, + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_, + 0, + None, + 2.0, + False, + False, + ) + l_input_ids_ = ( + l_self_modules_embeddings_modules_word_embeddings_parameters_weight_ + ) = None + token_type_embeddings = torch.nn.functional.embedding( + l_token_type_ids_, + l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + l_token_type_ids_ = ( + 
l_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_ + ) = None + embeddings = inputs_embeds + token_type_embeddings + inputs_embeds = token_type_embeddings = None + position_embeddings = torch.nn.functional.embedding( + position_ids, + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_, + None, + None, + 2.0, + False, + False, + ) + position_ids = ( + l_self_modules_embeddings_modules_position_embeddings_parameters_weight_ + ) = None + embeddings += position_embeddings + embeddings_1 = embeddings + embeddings = position_embeddings = None + embeddings_2 = torch.nn.functional.layer_norm( + embeddings_1, + (768,), + l_self_modules_embeddings_modules_layer_norm_parameters_weight_, + l_self_modules_embeddings_modules_layer_norm_parameters_bias_, + 1e-12, + ) + embeddings_1 = ( + l_self_modules_embeddings_modules_layer_norm_parameters_weight_ + ) = l_self_modules_embeddings_modules_layer_norm_parameters_bias_ = None + embeddings_3 = torch.nn.functional.dropout(embeddings_2, 0.1, False, False) + embeddings_2 = None + getitem_1 = l_attention_mask_[ + (slice(None, None, None), None, None, slice(None, None, None)) + ] + l_attention_mask_ = None + expand = getitem_1.expand(1, 1, 18, 18) + getitem_1 = None + expanded_mask = expand.to(torch.float32) + expand = None + tensor = torch.tensor(1.0, dtype=torch.float32) + inverted_mask = tensor - expanded_mask + tensor = expanded_mask = None + to_1 = inverted_mask.to(torch.bool) + extended_attention_mask = inverted_mask.masked_fill( + to_1, -3.4028234663852886e38 + ) + inverted_mask = to_1 = None + linear = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view = linear.view(1, -1, 12, 64) + linear = None + query_layer = view.transpose(1, 2) + view = None + linear_1 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_1 = linear_1.view(1, -1, 12, 64) + linear_1 = None + key_layer = view_1.transpose(1, 2) + view_1 = None + linear_2 = torch._C._nn.linear( + embeddings_3, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_2 = linear_2.view(1, -1, 12, 64) + linear_2 = None + value_layer = view_2.transpose(1, 2) + view_2 = None + attn_output = torch._C._nn.scaled_dot_product_attention( + query_layer, + key_layer, + 
value_layer, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer = key_layer = value_layer = None + attn_output_1 = attn_output.transpose(1, 2) + attn_output = None + attn_output_2 = attn_output_1.reshape(1, 18, 768) + attn_output_1 = None + hidden_states = torch._C._nn.linear( + attn_output_2, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_2 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_1 = torch.nn.functional.dropout(hidden_states, 0.1, False, False) + hidden_states = None + add_1 = hidden_states_1 + embeddings_3 + hidden_states_1 = embeddings_3 = None + hidden_states_2 = torch.nn.functional.layer_norm( + add_1, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_1 = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_3 = torch._C._nn.linear( + hidden_states_2, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_4 = torch._C._nn.gelu(hidden_states_3) + hidden_states_3 = None + hidden_states_5 = torch._C._nn.linear( + hidden_states_4, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_4 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_6 = torch.nn.functional.dropout( + hidden_states_5, 0.1, False, False + ) + hidden_states_5 = None + add_2 = hidden_states_6 + hidden_states_2 + hidden_states_6 = hidden_states_2 = None + hidden_states_7 = torch.nn.functional.layer_norm( + add_2, + (768,), + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_2 = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_0_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_6 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_3 = linear_6.view(1, -1, 12, 64) + linear_6 = None + query_layer_1 = view_3.transpose(1, 2) + view_3 = None + linear_7 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_4 = linear_7.view(1, -1, 12, 64) + linear_7 = None + key_layer_1 = view_4.transpose(1, 2) + view_4 = None + linear_8 = torch._C._nn.linear( + hidden_states_7, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_5 = linear_8.view(1, -1, 12, 64) + linear_8 = None + value_layer_1 = view_5.transpose(1, 2) + view_5 = None + attn_output_3 = torch._C._nn.scaled_dot_product_attention( + query_layer_1, + key_layer_1, + value_layer_1, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_1 = key_layer_1 = value_layer_1 = None + attn_output_4 = attn_output_3.transpose(1, 2) + attn_output_3 = None + attn_output_5 = attn_output_4.reshape(1, 18, 768) + attn_output_4 = None + hidden_states_8 = torch._C._nn.linear( + attn_output_5, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_5 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_9 = torch.nn.functional.dropout( + hidden_states_8, 0.1, False, False + ) + hidden_states_8 = None + add_3 = hidden_states_9 + hidden_states_7 + hidden_states_9 = hidden_states_7 = None + hidden_states_10 = torch.nn.functional.layer_norm( + add_3, + (768,), + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_3 = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_11 = torch._C._nn.linear( + hidden_states_10, + 
l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_12 = torch._C._nn.gelu(hidden_states_11) + hidden_states_11 = None + hidden_states_13 = torch._C._nn.linear( + hidden_states_12, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_12 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_14 = torch.nn.functional.dropout( + hidden_states_13, 0.1, False, False + ) + hidden_states_13 = None + add_4 = hidden_states_14 + hidden_states_10 + hidden_states_14 = hidden_states_10 = None + hidden_states_15 = torch.nn.functional.layer_norm( + add_4, + (768,), + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_4 = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_1_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_12 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_6 = linear_12.view(1, -1, 12, 64) + linear_12 = None + query_layer_2 = view_6.transpose(1, 2) + view_6 = None + linear_13 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_7 = linear_13.view(1, -1, 12, 64) + linear_13 = None + key_layer_2 = view_7.transpose(1, 2) + view_7 = None + linear_14 = torch._C._nn.linear( + hidden_states_15, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_8 = linear_14.view(1, -1, 12, 64) + linear_14 = None + value_layer_2 
= view_8.transpose(1, 2) + view_8 = None + attn_output_6 = torch._C._nn.scaled_dot_product_attention( + query_layer_2, + key_layer_2, + value_layer_2, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_2 = key_layer_2 = value_layer_2 = None + attn_output_7 = attn_output_6.transpose(1, 2) + attn_output_6 = None + attn_output_8 = attn_output_7.reshape(1, 18, 768) + attn_output_7 = None + hidden_states_16 = torch._C._nn.linear( + attn_output_8, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_8 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_17 = torch.nn.functional.dropout( + hidden_states_16, 0.1, False, False + ) + hidden_states_16 = None + add_5 = hidden_states_17 + hidden_states_15 + hidden_states_17 = hidden_states_15 = None + hidden_states_18 = torch.nn.functional.layer_norm( + add_5, + (768,), + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_5 = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_19 = torch._C._nn.linear( + hidden_states_18, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_20 = torch._C._nn.gelu(hidden_states_19) + hidden_states_19 = None + hidden_states_21 = torch._C._nn.linear( + hidden_states_20, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_20 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_22 = torch.nn.functional.dropout( + hidden_states_21, 0.1, False, False + ) + hidden_states_21 = None + add_6 = hidden_states_22 + hidden_states_18 + hidden_states_22 = hidden_states_18 = None + hidden_states_23 = torch.nn.functional.layer_norm( + add_6, + (768,), + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_6 = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_2_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_18 = 
torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_9 = linear_18.view(1, -1, 12, 64) + linear_18 = None + query_layer_3 = view_9.transpose(1, 2) + view_9 = None + linear_19 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_10 = linear_19.view(1, -1, 12, 64) + linear_19 = None + key_layer_3 = view_10.transpose(1, 2) + view_10 = None + linear_20 = torch._C._nn.linear( + hidden_states_23, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_11 = linear_20.view(1, -1, 12, 64) + linear_20 = None + value_layer_3 = view_11.transpose(1, 2) + view_11 = None + attn_output_9 = torch._C._nn.scaled_dot_product_attention( + query_layer_3, + key_layer_3, + value_layer_3, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_3 = key_layer_3 = value_layer_3 = None + attn_output_10 = attn_output_9.transpose(1, 2) + attn_output_9 = None + attn_output_11 = attn_output_10.reshape(1, 18, 768) + attn_output_10 = None + hidden_states_24 = torch._C._nn.linear( + attn_output_11, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_11 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_25 = torch.nn.functional.dropout( + hidden_states_24, 0.1, False, False + ) + hidden_states_24 = None + add_7 = hidden_states_25 + hidden_states_23 + hidden_states_25 = hidden_states_23 = None + hidden_states_26 = torch.nn.functional.layer_norm( + add_7, + (768,), + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_7 = l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = 
l_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_27 = torch._C._nn.linear( + hidden_states_26, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_28 = torch._C._nn.gelu(hidden_states_27) + hidden_states_27 = None + hidden_states_29 = torch._C._nn.linear( + hidden_states_28, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_28 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_30 = torch.nn.functional.dropout( + hidden_states_29, 0.1, False, False + ) + hidden_states_29 = None + add_8 = hidden_states_30 + hidden_states_26 + hidden_states_30 = hidden_states_26 = None + hidden_states_31 = torch.nn.functional.layer_norm( + add_8, + (768,), + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_8 = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_3_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_24 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_12 = linear_24.view(1, -1, 12, 64) + linear_24 = None + query_layer_4 = view_12.transpose(1, 2) + view_12 = None + linear_25 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_13 = linear_25.view(1, -1, 12, 64) + linear_25 = None + key_layer_4 = view_13.transpose(1, 2) + view_13 = None + linear_26 = torch._C._nn.linear( + hidden_states_31, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_ 
= l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_14 = linear_26.view(1, -1, 12, 64) + linear_26 = None + value_layer_4 = view_14.transpose(1, 2) + view_14 = None + attn_output_12 = torch._C._nn.scaled_dot_product_attention( + query_layer_4, + key_layer_4, + value_layer_4, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_4 = key_layer_4 = value_layer_4 = None + attn_output_13 = attn_output_12.transpose(1, 2) + attn_output_12 = None + attn_output_14 = attn_output_13.reshape(1, 18, 768) + attn_output_13 = None + hidden_states_32 = torch._C._nn.linear( + attn_output_14, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_14 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_33 = torch.nn.functional.dropout( + hidden_states_32, 0.1, False, False + ) + hidden_states_32 = None + add_9 = hidden_states_33 + hidden_states_31 + hidden_states_33 = hidden_states_31 = None + hidden_states_34 = torch.nn.functional.layer_norm( + add_9, + (768,), + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_9 = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_35 = torch._C._nn.linear( + hidden_states_34, + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_36 = torch._C._nn.gelu(hidden_states_35) + hidden_states_35 = None + hidden_states_37 = torch._C._nn.linear( + hidden_states_36, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_36 = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_38 = torch.nn.functional.dropout( + hidden_states_37, 0.1, False, False + ) + hidden_states_37 = None + add_10 = hidden_states_38 + hidden_states_34 + hidden_states_38 = hidden_states_34 = None + hidden_states_39 = torch.nn.functional.layer_norm( + add_10, + (768,), + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_10 = 
l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_4_modules_output_modules_layer_norm_parameters_bias_ = (None) + linear_30 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_ = (None) + view_15 = linear_30.view(1, -1, 12, 64) + linear_30 = None + query_layer_5 = view_15.transpose(1, 2) + view_15 = None + linear_31 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_ = (None) + view_16 = linear_31.view(1, -1, 12, 64) + linear_31 = None + key_layer_5 = view_16.transpose(1, 2) + view_16 = None + linear_32 = torch._C._nn.linear( + hidden_states_39, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_ = (None) + view_17 = linear_32.view(1, -1, 12, 64) + linear_32 = None + value_layer_5 = view_17.transpose(1, 2) + view_17 = None + attn_output_15 = torch._C._nn.scaled_dot_product_attention( + query_layer_5, + key_layer_5, + value_layer_5, + attn_mask=extended_attention_mask, + dropout_p=0.0, + is_causal=False, + ) + query_layer_5 = key_layer_5 = value_layer_5 = extended_attention_mask = None + attn_output_16 = attn_output_15.transpose(1, 2) + attn_output_15 = None + attn_output_17 = attn_output_16.reshape(1, 18, 768) + attn_output_16 = None + hidden_states_40 = torch._C._nn.linear( + attn_output_17, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_, + ) + attn_output_17 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_41 = torch.nn.functional.dropout( + hidden_states_40, 0.1, False, False + ) + hidden_states_40 = None + add_11 = hidden_states_41 + hidden_states_39 + hidden_states_41 = hidden_states_39 = None + hidden_states_42 = torch.nn.functional.layer_norm( + add_11, + (768,), + l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_, + 
l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_11 = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_layer_norm_parameters_bias_ = (None) + hidden_states_43 = torch._C._nn.linear( + hidden_states_42, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_, + ) + l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_ = (None) + hidden_states_44 = torch._C._nn.gelu(hidden_states_43) + hidden_states_43 = None + hidden_states_45 = torch._C._nn.linear( + hidden_states_44, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_, + ) + hidden_states_44 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_ = (None) + hidden_states_46 = torch.nn.functional.dropout( + hidden_states_45, 0.1, False, False + ) + hidden_states_45 = None + add_12 = hidden_states_46 + hidden_states_42 + hidden_states_46 = hidden_states_42 = None + hidden_states_47 = torch.nn.functional.layer_norm( + add_12, + (768,), + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_, + l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_, + 1e-12, + ) + add_12 = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_weight_ = l_self_modules_encoder_modules_layer_modules_5_modules_output_modules_layer_norm_parameters_bias_ = (None) + first_token_tensor = hidden_states_47[(slice(None, None, None), 0)] + pooled_output = torch._C._nn.linear( + first_token_tensor, + l_self_modules_pooler_modules_dense_parameters_weight_, + l_self_modules_pooler_modules_dense_parameters_bias_, + ) + first_token_tensor = ( + l_self_modules_pooler_modules_dense_parameters_weight_ + ) = l_self_modules_pooler_modules_dense_parameters_bias_ = None + pooled_output_1 = torch.tanh(pooled_output) + pooled_output = None + return (hidden_states_47, pooled_output_1) diff --git a/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/weight_meta.py b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/weight_meta.py new file mode 100644 index 000000000..8f5bcd887 --- /dev/null +++ b/samples/transformers-auto-model/xc2450_distilbert-portuguese-cased-finetuned-bec_classification/weight_meta.py @@ -0,0 +1,1088 @@ +class Program_weight_tensor_meta_L_input_ids_: + name = "L_input_ids_" + shape = [1, 18] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [ + 101, + 1116, + 1814, + 5116, + 22286, + 847, + 727, + 430, + 374, + 3084, + 5965, + 14074, + 3982, + 416, + 1814, + 22281, + 119, + 102, + ] + + +class Program_weight_tensor_meta_L_token_type_ids_: + name = "L_token_type_ids_" + shape = [1, 18] + dtype = "torch.int64" + device = 
"cuda:0" + mean = None + std = None + data = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + + +class Program_weight_tensor_meta_L_self_modules_embeddings_buffers_position_ids_: + name = "L_self_modules_embeddings_buffers_position_ids_" + shape = [1, 512] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + min_val = 0 + max_val = 511 + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_word_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_word_embeddings_parameters_weight_" + shape = [29794, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_token_type_embeddings_parameters_weight_" + shape = [2, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_position_embeddings_parameters_weight_: + name = "L_self_modules_embeddings_modules_position_embeddings_parameters_weight_" + shape = [512, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_embeddings_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_embeddings_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_attention_mask_: + name = "L_attention_mask_" + shape = [1, 18] + dtype = "torch.int64" + device = "cuda:0" + mean = None + std = None + data = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 
+ std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_attention_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_weight_" + shape = [3072, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = -0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_intermediate_modules_dense_parameters_bias_" + shape = [3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_weight_" + shape = [768, 3072] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_: + name = 
"L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_dense_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_weight_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 1.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_0_modules_output_modules_LayerNorm_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_query_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_key_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_self_modules_value_parameters_bias_" + shape = [768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.000 + data = None + + +class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_: + name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_weight_" + shape = [768, 768] + dtype = "torch.float32" + device = "cuda:0" + mean = 0.000 + std = 0.020 + data = None + + 
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_1_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_2_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_3_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_4_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_query_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_key_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_self_modules_value_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_attention_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_weight_"
+    shape = [3072, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_intermediate_modules_dense_parameters_bias_"
+    shape = [3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_weight_"
+    shape = [768, 3072]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = -0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_weight_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 1.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_:
+    name = "L_self_modules_encoder_modules_layer_modules_5_modules_output_modules_LayerNorm_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_weight_:
+    name = "L_self_modules_pooler_modules_dense_parameters_weight_"
+    shape = [768, 768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.020
+    data = None
+
+
+class Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_:
+    name = "L_self_modules_pooler_modules_dense_parameters_bias_"
+    shape = [768]
+    dtype = "torch.float32"
+    device = "cuda:0"
+    mean = 0.000
+    std = 0.000
+    data = None
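
Note for reviewers: each generated Program_weight_tensor_meta_* class above records shape, dtype, device, mean, and std, which is enough to synthesize a stand-in tensor when data is None. A minimal sketch of such a consumer follows, assuming that sampling from the recorded moments is the intended reconstruction; the materialize helper is hypothetical and not part of this patch:

    import torch

    def materialize(meta):
        # Hypothetical helper: resolve "torch.float32" -> torch.float32.
        dtype = getattr(torch, meta.dtype.replace("torch.", ""))
        if meta.data is not None:
            # Exact values were captured; use them directly.
            return torch.tensor(meta.data, dtype=dtype, device=meta.device)
        # Otherwise draw values matching the recorded first/second moments.
        # std = 0.000 yields a constant tensor, e.g. LayerNorm weights at 1.0.
        t = torch.empty(meta.shape, dtype=dtype, device=meta.device)
        return t.normal_(mean=meta.mean, std=meta.std)

    # Example (requires a CUDA device, since the metadata records "cuda:0"):
    w = materialize(Program_weight_tensor_meta_L_self_modules_pooler_modules_dense_parameters_bias_)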