Merge pull request #48 from dhruvdcoder/jaylee-patch
Made changes for applying ranking loss directly to the TaskNN
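For context, the two configs below train the TaskNN (inference network) with a 'combination-loss': a weighted sum of a ScoreNN-driven term and multi-label BCE, with weights dvn_score_loss_weight and cross_entropy_loss_weight, while the model's own loss_fn is an NCE ranking loss with discrete sampling. The following is a minimal PyTorch-style sketch of that weighted TaskNN objective only; the function name is hypothetical, the sign convention on the score term is an assumption, and how the ScoreNN value is computed is left to the score network (the real loss classes live in this repository).

import torch
import torch.nn.functional as F

def tasknn_combination_loss(y_hat, y_true, score_nn_value,
                            dvn_score_loss_weight, cross_entropy_loss_weight):
    """Conceptual sketch of the 'combination-loss' used by the inference module.

    y_hat:          TaskNN label probabilities, shape (batch, num_labels)
    y_true:         gold binary labels, shape (batch, num_labels)
    score_nn_value: ScoreNN score assigned to y_hat, shape (batch,)
    """
    # 'multi-label-score-loss': term driven by the ScoreNN score of y_hat
    # (assumed here: a higher score should mean a lower loss).
    score_term = -score_nn_value

    # 'multi-label-bce': per-instance multi-label cross entropy on the same output.
    bce_term = F.binary_cross_entropy(y_hat, y_true, reduction='none').mean(dim=-1)

    # loss_weights: [dvn_score_loss_weight, cross_entropy_loss_weight], reduction: 'mean'
    per_instance = dvn_score_loss_weight * score_term + cross_entropy_loss_weight * bce_term
    return per_instance.mean()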
Showing 22 changed files with 1,489 additions and 7 deletions.
.../multilabel_classification/v2.5/eurlex_ranking_discrete_on_scorenn_tasknn_reverse.jsonnet (189 additions, 0 deletions)
@@ -0,0 +1,189 @@
local test = std.extVar('TEST');  // a test run with small dataset
local data_dir = std.extVar('DATA_DIR');
local cuda_device = std.extVar('CUDA_DEVICE');
local use_wandb = (if test == '1' then false else true);

local dataset_name = std.parseJson(std.extVar('dataset_name'));
local dataset_metadata = (import '../datasets.jsonnet')[dataset_name];
local num_labels = dataset_metadata.num_labels;
local num_input_features = dataset_metadata.input_features;

// model variables
local ff_hidden = std.parseJson(std.extVar('ff_hidden'));
local label_space_dim = ff_hidden;
local ff_dropout = std.parseJson(std.extVar('ff_dropout_10x')) / 10.0;
local ff_activation = 'softplus';
local ff_linear_layers = std.parseJson(std.extVar('ff_linear_layers'));
local ff_weight_decay = std.parseJson(std.extVar('ff_weight_decay'));
local global_score_hidden_dim = std.parseJson(std.extVar('global_score_hidden_dim'));
local gain = (if ff_activation == 'tanh' then 5 / 3 else 1);
local cross_entropy_loss_weight = std.parseJson(std.extVar('cross_entropy_loss_weight'));
local dvn_score_loss_weight = std.parseJson(std.extVar('dvn_score_loss_weight'));
local task_temp = std.parseJson(std.extVar('task_nn_steps'));  // variable for task_nn.steps
local task_nn_steps = (if std.toString(task_temp) == '0' then 1 else task_temp);
local score_temp = std.parseJson(std.extVar('score_nn_steps'));  // variable for score_nn.steps
local score_nn_steps = (if std.toString(score_temp) == '0' then 1 else score_temp);

{
  [if use_wandb then 'type']: 'train_test_log_to_wandb',
  evaluate_on_test: true,
  // Data
  dataset_reader: {
    type: 'arff',
    num_labels: num_labels,
  },
  validation_dataset_reader: {
    type: 'arff',
    num_labels: num_labels,
  },
  train_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                    dataset_metadata.train_file),
  validation_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                         dataset_metadata.validation_file),
  test_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                   dataset_metadata.test_file),

  // Model
  model: {
    type: 'multi-label-classification-with-infnet',
    sampler: {
      type: 'appending-container',
      log_key: 'sampler',
      constituent_samplers: [],
    },
    task_nn: {
      type: 'multi-label-classification',
      feature_network: {
        input_dim: num_input_features,
        num_layers: ff_linear_layers,
        activations: ([ff_activation for i in std.range(0, ff_linear_layers - 2)] + [ff_activation]),
        hidden_dims: ff_hidden,
        dropout: ([ff_dropout for i in std.range(0, ff_linear_layers - 2)] + [0]),
      },
      label_embeddings: {
        embedding_dim: ff_hidden,
        vocab_namespace: 'labels',
      },
    },
    inference_module: {
      type: 'multi-label-inference-net-normalized',
      log_key: 'inference_module',
      loss_fn: {
        type: 'combination-loss',
        log_key: 'loss',
        constituent_losses: [
          {
            type: 'multi-label-score-loss',
            log_key: 'neg.nce_score',
            normalize_y: true,
            reduction: 'none',
          },  // This loss can be different from the main loss // change this
          {
            type: 'multi-label-bce',
            reduction: 'none',
            log_key: 'bce',
          },
        ],
        loss_weights: [dvn_score_loss_weight, cross_entropy_loss_weight],
        reduction: 'mean',
      },
    },
    oracle_value_function: { type: 'per-instance-f1', differentiable: false },
    score_nn: {
      type: 'multi-label-classification',
      task_nn: {
        type: 'multi-label-classification',
        feature_network: {
          input_dim: num_input_features,
          num_layers: ff_linear_layers,
          activations: ([ff_activation for i in std.range(0, ff_linear_layers - 2)] + [ff_activation]),
          hidden_dims: ff_hidden,
          dropout: ([ff_dropout for i in std.range(0, ff_linear_layers - 2)] + [0]),
        },
        label_embeddings: {
          embedding_dim: ff_hidden,
          vocab_namespace: 'labels',
        },
      },
      global_score: {
        type: 'multi-label-feedforward',
        feedforward: {
          input_dim: num_labels,
          num_layers: 1,
          activations: ff_activation,
          hidden_dims: global_score_hidden_dim,
        },
      },
    },
    loss_fn: {
      type: 'multi-label-nce-ranking-with-discrete-sampling',
      log_key: 'nce',
      num_samples: 10,
      sign: '-',
    },
    initializer: {
      regexes: [
        //[@'.*_feedforward._linear_layers.0.weight', {type: 'normal'}],
        [@'.*_linear_layers.*weight', (if std.member(['tanh', 'sigmoid'], ff_activation) then { type: 'xavier_uniform', gain: gain } else { type: 'kaiming_uniform', nonlinearity: 'relu' })],
        [@'.*linear_layers.*bias', { type: 'zero' }],
      ],
    },
  },
  data_loader: {
    shuffle: true,
    batch_size: 32,
  },
  trainer: {
    type: 'gradient_descent_minimax',
    num_epochs: if test == '1' then 10 else 300,
    grad_norm: { task_nn: 10.0 },
    patience: 20,
    validation_metric: '+fixed_f1',
    cuda_device: std.parseInt(cuda_device),
    learning_rate_schedulers: {
      task_nn: {
        type: 'reduce_on_plateau',
        factor: 0.5,
        mode: 'max',
        patience: 5,
        verbose: true,
      },
    },
    optimizer: {
      optimizers: {
        task_nn: {
          lr: 0.001,
          weight_decay: ff_weight_decay,
          type: 'adamw',
        },
        score_nn: {
          lr: 0.005,
          weight_decay: ff_weight_decay,
          type: 'adamw',
        },
      },
    },
    checkpointer: {
      keep_most_recent_by_count: 1,
    },
    callbacks: [
      'track_epoch_callback',
      'slurm',
    ] + (
      if use_wandb then [
        {
          type: 'wandb_allennlp',
          sub_callbacks: [{ type: 'log_best_validation_metrics', priority: 100 }],
          save_model_archive: false,
        },
      ]
      else []
    ),
    inner_mode: 'score_nn',
    num_steps: { task_nn: task_nn_steps, score_nn: score_nn_steps },
  },
  vocabulary: {
    type: 'from_files',
    directory: (data_dir + '/' + dataset_metadata.dir_name + '/' + 'eurlex-ev-vocab'),
  }
}
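One detail worth noting in the feature_network blocks above: the Jsonnet comprehensions build per-layer activation and dropout lists so that every layer uses the chosen activation, but dropout is applied to all layers except the last. A small Python sketch of the equivalent list construction, using purely illustrative values (ff_linear_layers = 3, ff_dropout = 0.2):

ff_linear_layers = 3        # illustrative; set via the 'ff_linear_layers' env var
ff_activation = 'softplus'
ff_dropout = 0.2            # 'ff_dropout_10x' / 10.0

# Mirrors: [ff_activation for i in std.range(0, ff_linear_layers - 2)] + [ff_activation]
activations = [ff_activation for _ in range(ff_linear_layers - 1)] + [ff_activation]
# -> ['softplus', 'softplus', 'softplus']  (one entry per layer)

# Mirrors: [ff_dropout for i in std.range(0, ff_linear_layers - 2)] + [0]
dropout = [ff_dropout for _ in range(ff_linear_layers - 1)] + [0]
# -> [0.2, 0.2, 0]  (no dropout after the final layer)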
...multilabel_classification/v2.5/gendata_ranking_discrete_on_scorenn_tasknn_reverse.jsonnet (186 additions, 0 deletions)
@@ -0,0 +1,186 @@
local test = std.extVar('TEST');  // a test run with small dataset
local data_dir = std.extVar('DATA_DIR');
local cuda_device = std.extVar('CUDA_DEVICE');
local use_wandb = (if test == '1' then false else true);

local dataset_name = std.parseJson(std.extVar('dataset_name'));
local dataset_metadata = (import '../datasets.jsonnet')[dataset_name];
local num_labels = dataset_metadata.num_labels;
local num_input_features = dataset_metadata.input_features;

// model variables
local ff_hidden = std.parseJson(std.extVar('ff_hidden'));
local label_space_dim = ff_hidden;
local ff_dropout = std.parseJson(std.extVar('ff_dropout_10x')) / 10.0;
local ff_activation = 'softplus';
local ff_linear_layers = std.parseJson(std.extVar('ff_linear_layers'));
local ff_weight_decay = std.parseJson(std.extVar('ff_weight_decay'));
local global_score_hidden_dim = std.parseJson(std.extVar('global_score_hidden_dim'));
local gain = (if ff_activation == 'tanh' then 5 / 3 else 1);
local cross_entropy_loss_weight = std.parseJson(std.extVar('cross_entropy_loss_weight'));
local dvn_score_loss_weight = std.parseJson(std.extVar('dvn_score_loss_weight'));
local task_temp = std.parseJson(std.extVar('task_nn_steps'));  // variable for task_nn.steps
local task_nn_steps = (if std.toString(task_temp) == '0' then 1 else task_temp);
local score_temp = std.parseJson(std.extVar('score_nn_steps'));  // variable for score_nn.steps
local score_nn_steps = (if std.toString(score_temp) == '0' then 1 else score_temp);

{
  [if use_wandb then 'type']: 'train_test_log_to_wandb',
  evaluate_on_test: true,
  // Data
  dataset_reader: {
    type: 'arff',
    num_labels: num_labels,
  },
  validation_dataset_reader: {
    type: 'arff',
    num_labels: num_labels,
  },
  train_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                    dataset_metadata.train_file),
  validation_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                         dataset_metadata.validation_file),
  test_data_path: (data_dir + '/' + dataset_metadata.dir_name + '/' +
                   dataset_metadata.test_file),

  // Model
  model: {
    type: 'multi-label-classification-with-infnet',
    sampler: {
      type: 'appending-container',
      log_key: 'sampler',
      constituent_samplers: [],
    },
    task_nn: {
      type: 'multi-label-classification',
      feature_network: {
        input_dim: num_input_features,
        num_layers: ff_linear_layers,
        activations: ([ff_activation for i in std.range(0, ff_linear_layers - 2)] + [ff_activation]),
        hidden_dims: ff_hidden,
        dropout: ([ff_dropout for i in std.range(0, ff_linear_layers - 2)] + [0]),
      },
      label_embeddings: {
        embedding_dim: ff_hidden,
        vocab_namespace: 'labels',
      },
    },
    inference_module: {
      type: 'multi-label-inference-net-normalized',
      log_key: 'inference_module',
      loss_fn: {
        type: 'combination-loss',
        log_key: 'loss',
        constituent_losses: [
          {
            type: 'multi-label-score-loss',
            log_key: 'neg.nce_score',
            normalize_y: true,
            reduction: 'none',
          },  // This loss can be different from the main loss // change this
          {
            type: 'multi-label-bce',
            reduction: 'none',
            log_key: 'bce',
          },
        ],
        loss_weights: [dvn_score_loss_weight, cross_entropy_loss_weight],
        reduction: 'mean',
      },
    },
    oracle_value_function: { type: 'per-instance-f1', differentiable: false },
    score_nn: {
      type: 'multi-label-classification',
      task_nn: {
        type: 'multi-label-classification',
        feature_network: {
          input_dim: num_input_features,
          num_layers: ff_linear_layers,
          activations: ([ff_activation for i in std.range(0, ff_linear_layers - 2)] + [ff_activation]),
          hidden_dims: ff_hidden,
          dropout: ([ff_dropout for i in std.range(0, ff_linear_layers - 2)] + [0]),
        },
        label_embeddings: {
          embedding_dim: ff_hidden,
          vocab_namespace: 'labels',
        },
      },
      global_score: {
        type: 'multi-label-feedforward',
        feedforward: {
          input_dim: num_labels,
          num_layers: 1,
          activations: ff_activation,
          hidden_dims: global_score_hidden_dim,
        },
      },
    },
    loss_fn: {
      type: 'multi-label-nce-ranking-with-discrete-sampling',
      log_key: 'nce',
      num_samples: 10,
      sign: '-',
      use_distance: false,
    },
    initializer: {
      regexes: [
        //[@'.*_feedforward._linear_layers.0.weight', {type: 'normal'}],
        [@'.*_linear_layers.*weight', (if std.member(['tanh', 'sigmoid'], ff_activation) then { type: 'xavier_uniform', gain: gain } else { type: 'kaiming_uniform', nonlinearity: 'relu' })],
        [@'.*linear_layers.*bias', { type: 'zero' }],
      ],
    },
  },
  data_loader: {
    shuffle: true,
    batch_size: 32,
  },
  trainer: {
    type: 'gradient_descent_minimax',
    num_epochs: if test == '1' then 10 else 300,
    grad_norm: { task_nn: 10.0 },
    patience: 20,
    validation_metric: '+fixed_f1',
    cuda_device: std.parseInt(cuda_device),
    learning_rate_schedulers: {
      task_nn: {
        type: 'reduce_on_plateau',
        factor: 0.5,
        mode: 'max',
        patience: 5,
        verbose: true,
      },
    },
    optimizer: {
      optimizers: {
        task_nn: {
          lr: 0.001,
          weight_decay: ff_weight_decay,
          type: 'adamw',
        },
        score_nn: {
          lr: 0.005,
          weight_decay: ff_weight_decay,
          type: 'adamw',
        },
      },
    },
    checkpointer: {
      keep_most_recent_by_count: 1,
    },
    callbacks: [
      'track_epoch_callback',
      'slurm',
    ] + (
      if use_wandb then [
        {
          type: 'wandb_allennlp',
          sub_callbacks: [{ type: 'log_best_validation_metrics', priority: 100 }],
          save_model_archive: false,
        },
      ]
      else []
    ),
    inner_mode: 'score_nn',
    num_steps: { task_nn: task_nn_steps, score_nn: score_nn_steps },
  },
}
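Both configs use the gradient_descent_minimax trainer with inner_mode: 'score_nn' and separate optimizers and step counts for the two networks. Roughly, each batch alternates between updating the ScoreNN and updating the TaskNN; the Python sketch below is a conceptual outline under that assumption, not the actual trainer implementation, and the loss callables are hypothetical.

def minimax_training_step(batch, task_nn_opt, score_nn_opt,
                          score_nn_loss_fn, task_nn_loss_fn,
                          task_nn_steps=1, score_nn_steps=1):
    """Conceptual outline of one batch under 'gradient_descent_minimax'
    with inner_mode='score_nn'; loss functions are passed in as assumed helpers."""
    # Inner loop: update the ScoreNN (e.g. with the NCE ranking loss).
    for _ in range(score_nn_steps):
        score_nn_opt.zero_grad()
        score_nn_loss_fn(batch).backward()
        score_nn_opt.step()

    # Outer loop: update the TaskNN (e.g. with the combination loss above).
    for _ in range(task_nn_steps):
        task_nn_opt.zero_grad()
        task_nn_loss_fn(batch).backward()
        task_nn_opt.step()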