Update edge probing runner and tokenization scripts (#858)

* Update NFS paths
* Remove unused experiment scripts
* Fix edge probing configs
* Add username to Kubernetes jobs

  Plays nicer on multi-user environments: everything is now written to /nfs/jiant/exp/$USER/$PROJECT_NAME and jobs are prefixed with $USER.
* Move args to top of retokenize script for better documentation
* Update retokenize_bert.sh
* Update edge probing experiment script
* Add username to job name and project dir for Kubernetes runs

  Better default behavior on a multi-user cluster.
* Formatter
nyu-mll · Jul 22, 2019 · 7ff148e
1 parent 5804e55
commit 7ff148e
Show file tree

Hide file tree

Showing 9 changed files with 66 additions and 211 deletions.
diff --git a/config/edgeprobe/edgeprobe_bare.conf b/config/edgeprobe/edgeprobe_bare.conf
@@ -24,8 +24,6 @@ lr_patience = 5 // vals until LR decay
 patience = 20 // vals until early-stopping
 
 tokenizer = "" // use native tokenization with ELMo
-elmo = 1
-elmo_chars_only = 1 // set to 0 to use full ELMo
 
 // Use no-op encoder (no params).
 sent_enc = "none"

diff --git a/probing/retokenize_bert.sh b/probing/retokenize_bert.sh
@@ -2,34 +2,39 @@
 
 set -eu
 
-EDGE_DATA_PATH="/nfs/jsalt/share/glue_data/edges"
+EDGE_DATA_PATH="$JIANT_DATA_DIR/edges"
+echo "Processing edge probing data in $EDGE_DATA_PATH"
 
 declare -a SUBPATHS
-SUBPATHS+=( "ontonotes-constituents" )
-SUBPATHS+=( "dep_ewt" )
-SUBPATHS+=( "ontonotes-ner" )
-SUBPATHS+=( "srl_conll2012" )
-SUBPATHS+=( "ontonotes-coref-conll" )
 SUBPATHS+=( "spr1" )
 SUBPATHS+=( "spr2" )
 SUBPATHS+=( "dpr" )
+SUBPATHS+=( "dep_ewt" )
+SUBPATHS+=( "ontonotes/const/pos" )
+SUBPATHS+=( "ontonotes/const/nonterminal" )
+SUBPATHS+=( "ontonotes/srl" )
+SUBPATHS+=( "ontonotes/ner" )
+SUBPATHS+=( "ontonotes/coref" )
+SUBPATHS+=( "semeval" )
+SUBPATHS+=( "tacred/rel" )
+SUBPATHS+=( "noun_verb" )
 
 for subpath in "${SUBPATHS[@]}"; do
- python $(dirname $0)/retokenize_edge_data.bert.py \
- --model bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json &
- python $(dirname $0)/retokenize_edge_data.bert.py \
- --model bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json &
+ python $(dirname $0)/retokenize_edge_data.py \
+ -t bert-base-uncased $EDGE_DATA_PATH/$subpath/*.json &
+ python $(dirname $0)/retokenize_edge_data.py \
+ -t bert-large-uncased $EDGE_DATA_PATH/$subpath/*.json &
 done
 
 # exit 0
 
 # Only use the cased model on NER, per https://arxiv.org/pdf/1810.04805.pdf
-CASED_SUBPATHS=( "ontonotes-ner" )
+CASED_SUBPATHS=( "ontonotes/ner" )
 
 for subpath in "${CASED_SUBPATHS[@]}"; do
- python $(dirname $0)/retokenize_edge_data.bert.py \
- --model bert-base-cased $EDGE_DATA_PATH/$subpath/*.json &
- python $(dirname $0)/retokenize_edge_data.bert.py \
- --model bert-large-cased $EDGE_DATA_PATH/$subpath/*.json &
+ python $(dirname $0)/retokenize_edge_data.py \
+ -t bert-base-cased $EDGE_DATA_PATH/$subpath/*.json &
+ python $(dirname $0)/retokenize_edge_data.py \
+ -t bert-large-cased $EDGE_DATA_PATH/$subpath/*.json &
 done
 
diff --git a/probing/retokenize_edge_data.py b/probing/retokenize_edge_data.py
@@ -41,6 +41,12 @@
 
 log.basicConfig(format="%(asctime)s: %(message)s", datefmt="%m/%d %I:%M:%S %p", level=log.INFO)
 
+PARSER = argparse.ArgumentParser()
+PARSER.add_argument("-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name.")
+PARSER.add_argument(
+ "--num_parallel", type=int, default=4, help="Number of parallel processes to use."
+)
+PARSER.add_argument("inputs", type=str, nargs="+", help="Input JSON files.")
 
 # For now, this module expects MosesTokenizer as the default.
 # TODO: change this once we have better support in core utils.
@@ -81,15 +87,7 @@ def retokenize_file(fname, tokenizer_name, worker_pool):
 
 
 def main(args):
- parser = argparse.ArgumentParser()
- parser.add_argument(
- "-t", dest="tokenizer_name", type=str, required=True, help="Tokenizer name."
- )
- parser.add_argument(
- "--num_parallel", type=int, default=4, help="Number of parallel processes to use."
- )
- parser.add_argument("inputs", type=str, nargs="+", help="Input JSON files.")
- args = parser.parse_args(args)
+ args = PARSER.parse_args(args)
 
  worker_pool = multiprocessing.Pool(args.num_parallel)
  for fname in args.inputs:

diff --git a/scripts/edgeprobing/exp_fns.sh b/scripts/edgeprobing/exp_fns.sh
@@ -14,6 +14,8 @@
 #
 # See individual functions below for usage.
 
+EP_RESOURCE_DIR="/nfs/jiant/share/edge-probing/resources"
+
 function run_exp() {
  # Helper function to invoke main.py.
  # Don't run this directly - use the experiment functions below,
@@ -34,34 +36,34 @@ function elmo_chars_exp() {
  # Lexical baseline, probe ELMo char CNN layer.
  # Usage: elmo_chars_exp <task_name>
  OVERRIDES="exp_name=elmo-chars-$1, run_name=run"
- OVERRIDES+=", target_tasks=$1"
+ OVERRIDES+=", target_tasks=$1, input_module=elmo-chars-only"
  run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
 }
 
 function elmo_full_exp() {
  # Full ELMo, probe full ELMo with learned mixing weights.
  # Usage: elmo_full_exp <task_name>
  OVERRIDES="exp_name=elmo-full-$1, run_name=run"
- OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
+ OVERRIDES+=", target_tasks=$1, input_module=elmo"
  run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
 }
 
 function elmo_ortho_exp() {
  # Full ELMo with random orthogonal weights for LSTM and projections.
  # Usage: elmo_ortho_exp <task_name> <random_seed>
- ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5"
+ ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_ortho_seed_$2.hdf5"
  OVERRIDES="exp_name=elmo-ortho-$1, run_name=run_seed_$2"
- OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
+ OVERRIDES+=", target_tasks=$1, input_module=elmo"
  OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}"
  run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
 }
 
 function elmo_random_exp() {
  # Full ELMo with random normal weights for LSTM and projections.
  # Usage: elmo_random_exp <task_name> <random_seed>
- ELMO_WEIGHTS_PATH="/nfs/jsalt/share/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5"
+ ELMO_WEIGHTS_PATH="${EP_RESOURCE_DIR}/random_elmo/elmo_2x4096_512_2048cnn_2xhighway_weights_random_seed_$2.hdf5"
  OVERRIDES="exp_name=elmo-random-$1, run_name=run_seed_$2"
- OVERRIDES+=", target_tasks=$1, elmo_chars_only=0"
+ OVERRIDES+=", target_tasks=$1, input_module=elmo"
  OVERRIDES+=", elmo_weight_file_path=${ELMO_WEIGHTS_PATH}"
  run_exp "config/edgeprobe/edgeprobe_bare.conf" "${OVERRIDES}"
 }
@@ -71,14 +73,16 @@ function train_chars_exp() {
  # Usage: train_chars_exp <task_name> <max_vals> <val_interval>
  OVERRIDES="exp_name=train-chars-$1, run_name=run"
  OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3"
+ OVERRIDES+=", input_module=elmo-chars-only"
  run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}"
 }
 
 function train_full_exp() {
  # Trained encoder over full ELMo.
  # Usage: train_full_exp <task_name> <max_vals> <val_interval>
  OVERRIDES="exp_name=train-full-$1, run_name=run"
- OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3, elmo_chars_only=0"
+ OVERRIDES+=", pretrain_tasks=$1, max_vals=$2, val_interval=$3"
+ OVERRIDES+=", input_module=elmo"
  run_exp "config/edgeprobe/edgeprobe_train.conf" "${OVERRIDES}"
 }
 
@@ -140,7 +144,7 @@ function openai_mix_exp() {
 function openai_bwb_exp() {
  # Probe the OpenAI transformer model, as trained on BWB-shuffled.
  # Usage: openai_bwb_exp <task_name>
- CKPT_PATH="/nfs/jsalt/home/iftenney/checkpoints/bwb_shuffled/model.ckpt-1000000"
+ CKPT_PATH="${EP_RESOURCE_DIR}/checkpoints/bwb_shuffled/model.ckpt-1000000"
  OVERRIDES="exp_name=openai-bwb-$1, run_name=run"
  OVERRIDES+=", target_tasks=$1"
  OVERRIDES+=", openai_transformer_ckpt=${CKPT_PATH}"
@@ -157,7 +161,7 @@ function bert_cat_exp() {
  # Usage: bert_cat_exp <task_name>
  OVERRIDES="exp_name=bert-${2}-cat-${1}, run_name=run"
  OVERRIDES+=", target_tasks=$1"
- OVERRIDES+=", bert_model_name=bert-$2"
+ OVERRIDES+=", input_module=bert-$2"
  OVERRIDES+=", bert_embeddings_mode=cat"
  run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
 }
@@ -167,7 +171,7 @@ function bert_lex_exp() {
  # Usage: bert_lex_exp <task_name>
  OVERRIDES="exp_name=bert-${2}-lex-${1}, run_name=run"
  OVERRIDES+=", target_tasks=$1"
- OVERRIDES+=", bert_model_name=bert-$2"
+ OVERRIDES+=", input_module=bert-$2"
  OVERRIDES+=", bert_embeddings_mode=only"
  run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
 }
@@ -177,7 +181,7 @@ function bert_mix_exp() {
  # Usage: bert_mix_exp <task_name>
  OVERRIDES="exp_name=bert-${2}-mix-${1}, run_name=run"
  OVERRIDES+=", target_tasks=$1"
- OVERRIDES+=", bert_model_name=bert-$2"
+ OVERRIDES+=", input_module=bert-$2"
  OVERRIDES+=", bert_embeddings_mode=mix"
  run_exp "config/edgeprobe/edgeprobe_bert.conf" "${OVERRIDES}"
 }
diff --git a/scripts/edgeprobing/kubernetes_run_all.sh b/scripts/edgeprobing/kubernetes_run_all.sh
@@ -12,14 +12,14 @@
 set -e
 
 # Default arguments.
-PROJECT=""
+PROJECT_NAME=""
 NOTIFY_EMAIL="$NOTIFY_EMAIL" # allow pre-set from shell
 
 # Handle flags.
 OPTIND=1 # Reset in case getopts has been used previously in the shell.
 while getopts ":p:n:" opt; do
  case "$opt" in
- p) PROJECT=$OPTARG
+ p) PROJECT_NAME=$OPTARG
  ;;
  n) NOTIFY_EMAIL=$OPTARG
  ;;
@@ -34,7 +34,7 @@ shift $((OPTIND-1))
 # Remaining positional arguments.
 MODE=${1:-"create"}
 
-if [ -z $PROJECT ]; then
+if [ -z $PROJECT_NAME ]; then
  echo "You must provide a project name!"
  exit 1
 fi
@@ -47,8 +47,18 @@ fi
 # Top-level directory for the current repo.
 pushd $(git rev-parse --show-toplevel)
 
+# Get the NFS path from the Kubernetes config, so that it doesn't need to be
+# hardcoded here.
+pushd gcp/kubernetes/templates
+NFS_EXP_DIR=$(jsonnet -S -e "local env = import 'jiant_env.libsonnet'; env.nfs_exp_dir")
+echo "Assuming NFS experiment path at $NFS_EXP_DIR"
+popd
+
 # Make a copy of the current tree in the project directory.
-PROJECT_DIR="/nfs/jsalt/exp/$PROJECT"
+PROJECT_DIR="${NFS_EXP_DIR}/${USER}/${PROJECT_NAME}"
+if [ ! -d "${NFS_EXP_DIR}/$USER" ]; then
+ mkdir "${NFS_EXP_DIR}/$USER"
+fi
 if [ ! -d "${PROJECT_DIR}" ]; then
  echo "Creating project directory ${PROJECT_DIR}"
  mkdir ${PROJECT_DIR}
@@ -65,7 +75,7 @@ function make_kubernetes_command() {
  # Uses exp_fns.sh to generate configs; see that file for details
  # and to define new experiments.
  echo -n "pushd ${PATH_TO_JIANT}"
- echo -n "; source scripts/edges/exp_fns.sh"
+ echo -n "; source scripts/edgeprobing/exp_fns.sh"
  echo -n "; $@"
 }
 
@@ -74,8 +84,9 @@ function kuberun() {
  NAME=$1
  COMMAND=$(make_kubernetes_command $2)
  echo "Job '$NAME': '$COMMAND'"
- ./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT} -g ${GPU_TYPE} \
- -n ${NOTIFY_EMAIL} $NAME "$COMMAND"
+ ./gcp/kubernetes/run_batch.sh -m $MODE -p ${PROJECT_NAME} -g ${GPU_TYPE} \
+ $NAME "$COMMAND"
+ # -n ${NOTIFY_EMAIL} \ # Temporarily disabled
  echo ""
 }
 
@@ -103,14 +114,14 @@ if [[ $MODE == "delete" ]]; then
 fi
 
 ##
-# Run these on the main 'jsalt' cluster
-gcloud container clusters get-credentials --zone us-east1-c jsalt
+# Run these on p100s (default)
 export GPU_TYPE="p100"
 for task in "${ALL_TASKS[@]}"
 do
- kuberun elmo-chars-$task "elmo_chars_exp edges-$task"
- kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0"
- kuberun elmo-full-$task "elmo_full_exp edges-$task"
+ # ELMo is currently broken at master, so skip these.
+ # kuberun elmo-chars-$task "elmo_chars_exp edges-$task"
+ # kuberun elmo-ortho-$task "elmo_ortho_exp edges-$task 0"
+ # kuberun elmo-full-$task "elmo_full_exp edges-$task"
  kuberun glove-$task "glove_exp edges-$task"
  kuberun cove-$task "cove_exp edges-$task"
 
@@ -120,8 +131,7 @@ do
 done
 
 ##
-# Run these on 'jsalt-central' for V100s
-gcloud container clusters get-credentials --zone us-central1-a jsalt-central
+# Run the larger experiments (transformers) on v100s
 export GPU_TYPE="v100"
 for task in "${ALL_TASKS[@]}"
 do

diff --git a/scripts/edgeprobing/ortho_elmo_baselines.sh b/scripts/edgeprobing/ortho_elmo_baselines.sh
diff --git a/scripts/edgeprobing/random_elmo_baselines.sh b/scripts/edgeprobing/random_elmo_baselines.sh